# Home Credit Default Risk 2018

In [None]:
import numpy as np
import pandas as pd
# import matplotlib.pyplot as plt
# import seaborn as sns
import gc
import time
import warnings
from tqdm import tqdm
warnings.simplefilter(action = 'ignore', category = FutureWarning)
try:
    import cPickle as pickle
except:
    import pickle
import os

In [None]:
# Global seed: used for NumPy's RNG here and reused by the CV / model code below.
RANDOM_STATE = 142
np.random.seed(RANDOM_STATE)

In [None]:
# Directory the input files (sample submission, pickled data frame) are read from.
file_path = '../input/'
# NOTE(review): not referenced anywhere in the visible code — confirm it is still needed.
source_path = '../stack_it/stack_source/'
# Output directory; passed to cv_scores() as `folder_to_save`.
result_path = '../stack_it/stacking_results/'

In [None]:
# Sample submission; cv_scores() uses its SK_ID_CURR column as the ids of the test predictions.
submission_df = pd.read_csv(f'{file_path}sample_submission.csv')

In [None]:
from sklearn.metrics import roc_auc_score, precision_score, recall_score
from sklearn.model_selection import KFold, StratifiedKFold

In [None]:
from lightgbm import LGBMClassifier

In [None]:
from scipy.stats import gmean

In [None]:
from bayes_opt import BayesianOptimization

### Service functions

In [None]:
def convert_col_to_proper_int(df_col):
    """Downcast an integer Series to the narrowest integer dtype that holds it.

    Columns whose dtype name does not start with ``int`` or ``uint`` are
    returned untouched.  When the minimum is negative the smallest signed
    type covering ``[min, max]`` is chosen; otherwise the smallest unsigned
    type covering ``max`` is chosen.

    Returns the (possibly re-typed) Series; the argument is not modified.
    """
    dtype_name = str(df_col.dtype)
    if not (dtype_name.startswith('int') or dtype_name.startswith('uint')):
        return df_col

    lowest, highest = df_col.min(), df_col.max()

    if lowest < 0:
        # Negative values present: walk the signed types from narrow to wide.
        for candidate in (np.int8, np.int16, np.int32, np.int64):
            bounds = np.iinfo(candidate)
            if lowest >= bounds.min and highest <= bounds.max:
                return df_col.astype(candidate)
    else:
        # Non-negative data: only the upper bound matters for unsigned types.
        for candidate in (np.uint8, np.uint16, np.uint32, np.uint64):
            if highest <= np.iinfo(candidate).max:
                return df_col.astype(candidate)

    # No candidate matched (every comparison was false); keep the column as is.
    return df_col

def convert_col_to_proper_float(df_col):
    """Downcast a float Series to float32 / float16 when no values collapse.

    A narrower type is accepted only if

    * the number of distinct values (``np.unique``) is unchanged, and
    * every value that was finite is still finite after the cast, so a
      float64 value outside the float32 range is never turned into ``inf``.

    float16 is additionally tried only when min and max lie strictly inside
    the float16 range.  Non-float columns are returned untouched.

    Note that individual values may still be rounded by the narrower type;
    the checks only guarantee that distinct values stay distinct.

    Returns the (possibly re-typed) Series; the argument is not modified.
    """
    if not str(df_col.dtype).startswith('float'):
        return df_col

    unique_count = len(np.unique(df_col))

    # Overflow to +/-inf is detected explicitly below, so silence the cast warning.
    with np.errstate(over = 'ignore'):
        as_single = df_col.astype(np.float32)

    finite_kept = np.array_equal(np.isfinite(df_col.values), np.isfinite(as_single.values))
    if not finite_kept or len(np.unique(as_single)) != unique_count:
        return df_col

    half = np.finfo(np.float16)
    if as_single.min() > half.min and as_single.max() < half.max:
        as_half = as_single.astype(np.float16)
        if len(np.unique(as_half)) == unique_count:
            return as_half

    return as_single



def float_to_int(df):
    """Convert whole-number float columns of *df* to compact integer columns.

    A float column is converted only when every value is a whole number
    (``value % 1 == 0``; NaN and +/-inf fail this test) and all values fit
    into int64.  Such a column is cast to int64 and then narrowed by
    ``convert_col_to_proper_int``.  Columns with values outside the int64
    range are left as floats, because casting them would overflow.

    The frame is modified in place and also returned.
    """
    # float64 cannot represent 2**63 - 1 exactly, so the upper bound is exclusive.
    int64_low, int64_high = -2.0 ** 63, 2.0 ** 63

    for col in df.columns:
        values = df[col]
        if not str(values.dtype).startswith('float'):
            continue
        if not (values % 1 == 0).all():
            continue
        # An empty column has NaN min/max; it passes the test above and stays convertible.
        if len(values) and not (values.min() >= int64_low and values.max() < int64_high):
            continue
        df[col] = convert_col_to_proper_int(values.astype(np.int64))

    return df

def float_reduced(df):
    """Shrink every float column of *df* with ``convert_col_to_proper_float``.

    Columns whose dtype name does not start with ``float`` are skipped.
    The frame is modified in place and also returned.
    """
    float_cols = [name for name in df.columns if str(df[name].dtype).startswith('float')]
    for name in float_cols:
        df[name] = convert_col_to_proper_float(df[name])
    return df

def int_reduced(df):
    """Pass every column of *df* through ``convert_col_to_proper_int``.

    That helper narrows integer columns and hands back any other column
    unchanged.  The frame is modified in place and also returned.
    """
    for name in df.columns.tolist():
        narrowed = convert_col_to_proper_int(df[name])
        df[name] = narrowed
    return df

## Thank you, Guillaume Martin, for the awesome memory optimizer!
## https://www.kaggle.com/gemartin/load-data-reduce-memory-usage
def reduce_mem_usage(data, verbose = True):
    """Downcast the columns of *data* in place to reduce memory usage.

    * object, ``datetime64[ns]`` and category columns become ``category``;
    * signed and unsigned integer columns are narrowed by
      ``convert_col_to_proper_int``;
    * every other column is cast to the narrowest of float16 / float32 /
      float64 whose range strictly contains the column's min and max.

    The float choice looks at the value range only; precision is not checked,
    so float16 / float32 may round the stored values.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to shrink; modified in place.
    verbose : bool
        Print memory usage before and after, plus the percentage saved.

    Returns
    -------
    pandas.DataFrame
        The same frame object.
    """
    start_mem = data.memory_usage().sum() / 1024**2
    if verbose:
        print('Memory usage of dataframe: {:.2f} MB'.format(start_mem))

    half, single = np.finfo(np.float16), np.finfo(np.float32)

    for col in data.columns:
        col_type = data[col].dtype

        if not ((col_type != object) and (col_type != '<M8[ns]') and (col_type.name != 'category')):
            data[col] = data[col].astype('category')
            continue

        # 'uint*' must be matched explicitly: it does not start with 'int', and
        # unsigned columns would otherwise be turned into floats below.
        if str(col_type).startswith(('int', 'uint')):
            data[col] = convert_col_to_proper_int(data[col])
            continue

        # NOTE(review): bool columns also reach this point and become float16.
        c_min = data[col].min()
        c_max = data[col].max()
        if c_min > half.min and c_max < half.max:
            data[col] = data[col].astype(np.float16)
        elif c_min > single.min and c_max < single.max:
            data[col] = data[col].astype(np.float32)
        else:
            data[col] = data[col].astype(np.float64)

    end_mem = data.memory_usage().sum() / 1024**2
    if verbose:
        print('Memory usage after optimization: {:.2f} MB'.format(end_mem))
        print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return data

def gentle_reduce_mem_usage(data, verbose = True):
    """Downcast the columns of *data* in place without merging distinct values.

    * object, ``datetime64[ns]`` and category columns become ``category``;
    * signed and unsigned integer columns are narrowed by
      ``convert_col_to_proper_int``;
    * other columns holding only whole numbers that fit into int64 are cast
      to int64 and narrowed the same way;
    * all remaining columns go through ``convert_col_to_proper_float``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to shrink; modified in place.
    verbose : bool
        Print memory usage before and after, plus the percentage saved.

    Returns
    -------
    pandas.DataFrame
        The same frame object.
    """
    start_mem = data.memory_usage().sum() / 1024**2
    if verbose:
        print('Memory usage of dataframe: {:.2f} MB'.format(start_mem))

    # float64 cannot represent 2**63 - 1 exactly, so the upper bound is exclusive.
    int64_low, int64_high = -2.0 ** 63, 2.0 ** 63

    for col in data.columns:
        col_type = data[col].dtype

        if not ((col_type != object) and (col_type != '<M8[ns]') and (col_type.name != 'category')):
            data[col] = data[col].astype('category')
            continue

        # Match 'uint*' explicitly: sending unsigned columns through
        # astype(np.int64) below would wrap large uint64 values.
        if str(col_type).startswith(('int', 'uint')):
            data[col] = convert_col_to_proper_int(data[col])
            continue

        values = data[col]
        whole_numbers = (values % 1 == 0).all()
        # Whole-number floats outside the int64 range must stay floats.
        # An empty column (NaN min/max) keeps taking the integer path.
        if whole_numbers and (len(values) == 0
                              or (values.min() >= int64_low and values.max() < int64_high)):
            data[col] = convert_col_to_proper_int(values.astype(np.int64))
        else:
            data[col] = convert_col_to_proper_float(values)

    end_mem = data.memory_usage().sum() / 1024**2
    if verbose:
        print('Memory usage after optimization: {:.2f} MB'.format(end_mem))
        print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return data

## Load data

In [None]:
%%time
# Load the pickled DataFrame used for stacking.
# NOTE(review): read_pickle executes pickle data — only load files from a trusted source.
df = pd.read_pickle(f'{file_path}models_df07.pkl.zip')
# Turn the current index into a regular column.
df.reset_index(inplace = True)
df.info()

In [None]:
# Keep the id, the target and the selected prediction columns only;
# the commented-out names are left out of the stack.
df = df[['SK_ID_CURR', 'TARGET',
 'konyshev_0775CV',
 'ppleskov_neptune_LB0802',
#  'leonid26_638feats_5CV_078196',
 'tEarth_pred10_CV0794662',
#  'leonid22_719feats_5CV_079012',
 'tEarth_pred11_LB0798',
#  'leonid08_5CV_078551',
#  'leonid16_1293feats_5CV_079173',
#  'leonid21_315feats_5CV_079244',
#  'leonid24_1350feats_5CV_079194',
#  'leonid23_285feats_5CV_079546',
#  'leonid13_5CV_079019',
 'iv_079690CV_xxxxPL',
 'iv_079581CV_0795PL',
#  'leonid12_5CV_079137',
#  'leonid17_1155feats_5CV_079172',
 'iv_079634CV_0794PL']]

## Optimizing LGBM parameters

### Optimization and visualisation functions

In [None]:
def rank_average_df(data):
    """Blend the columns of *data* by averaging their ranks.

    Every column is ranked with ``Series.rank()``.  The ranks are summed per
    row and divided by ``number_of_rank_columns * number_of_rows``.

    Side effects: *data* is modified in place.  It gains one ``<name>_rank``
    column per original column, plus ``rank_sum`` and ``TARGET``.  Any column
    whose name already contains ``_rank`` is counted as a rank column too.

    Returns
    -------
    pandas.Series
        The ``TARGET`` column of *data*.
    """
    for key in data.columns:
        data[str(key) + '_rank'] = data[key].rank()

    rank_cols = [col for col in data.columns if '_rank' in str(col)]
    # Built-in sum: np.sum() over a generator is deprecated in NumPy and only
    # worked by falling back to this very function.
    data['rank_sum'] = sum(data[col] for col in rank_cols)
    data['TARGET'] = data['rank_sum'] / (len(rank_cols) * data.shape[0])
    return data['TARGET']

In [None]:
def cv_scores(df, num_folds, params, model_name, stratified = False, verbose = -1,
              early_stopping = 300,
              save_train_prediction = True, save_test_prediction = True, train_full_model=False,
              folder_to_save='../stack_it/', seed = 42, submission_sample = submission_df):
    """Cross-validate a LightGBM classifier on *df* and save its predictions.

    Rows of *df* with a non-null ``TARGET`` are the training set; rows with a
    null ``TARGET`` are the test set.  One ``LGBMClassifier`` is fitted per
    fold with AUC-based early stopping.  Out-of-fold predictions are collected
    for the training rows and every fold model also predicts the test rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``TARGET`` and ``SK_ID_CURR``.  Every column other than
        ``TARGET``, ``SK_ID_CURR``, ``SK_ID_BUREAU``, ``SK_ID_PREV`` and
        ``index`` is used as a feature.
    num_folds : int
        Number of CV folds.
    params : dict
        Keyword arguments for ``LGBMClassifier``.  The dict is not modified.
    model_name : str
        Prefix of the generated file names.
    stratified : bool
        Use ``StratifiedKFold`` instead of ``KFold``.
    verbose, early_stopping
        Passed to ``LGBMClassifier.fit`` as ``verbose`` and
        ``early_stopping_rounds``.
    save_train_prediction, save_test_prediction : bool
        Write out-of-fold / test prediction CSV files into *folder_to_save*.
    train_full_model : bool
        Additionally fit one model on all training rows, using only features
        with non-zero mean importance and the largest best-iteration count
        seen across folds.
    folder_to_save : str
        Output path prefix (concatenated as is with the file name).
    seed : int
        Random state of the fold splitter.
    submission_sample : pandas.DataFrame
        Its ``SK_ID_CURR`` column labels the test predictions.

    Returns
    -------
    (pandas.DataFrame, float)
        Per-fold feature importances (gain) with a ``mean`` column, and the
        AUC of the out-of-fold predictions over all training rows.

    Notes
    -----
    * Test predictions are attached to ``submission_sample['SK_ID_CURR']`` by
      position: the test rows of *df* are assumed to be in the same order (and
      number) as *submission_sample* — TODO confirm for new inputs.
    * Installs a global ``'ignore'`` warnings filter.
    * NOTE(review): ``fit(verbose=..., early_stopping_rounds=...)`` is the
      pre-4.0 LightGBM API; verify against the installed version.
    """
    warnings.simplefilter('ignore')

    # Divide in training/validation and test data
    train_df = df[df['TARGET'].notnull()]
    test_df = df[df['TARGET'].isnull()]
    print("Starting LightGBM. Train shape: {}, test shape: {}".format(train_df.shape, test_df.shape))

    # Cross validation model
    if stratified:
        folds = StratifiedKFold(n_splits = num_folds, shuffle = True, random_state = seed)
    else:
        folds = KFold(n_splits = num_folds, shuffle = True, random_state = seed)

    # Out-of-fold probabilities for the training rows.
    valid_pred_proba = np.zeros(train_df.shape[0])

    # One column of test probabilities per fold, indexed by submission id.
    test_prediction = pd.DataFrame(index = submission_sample['SK_ID_CURR'])

    feats = [f for f in train_df.columns if f not in ['TARGET','SK_ID_CURR','SK_ID_BUREAU','SK_ID_PREV','index']]

    df_feature_importance = pd.DataFrame(index = feats)

    # int32 instead of uint16: iteration counts above 65535 must not overflow.
    iterations = np.zeros(num_folds, dtype=np.int32)
    train_scores = np.zeros(num_folds, dtype=np.float32)
    fold_scores = np.zeros(num_folds, dtype=np.float32)

    for n_fold, (train_idx, valid_idx) in enumerate(folds.split(train_df[feats], train_df['TARGET'])):
        # A different model seed per fold.
        clf = LGBMClassifier(importance_type = 'gain',
                             random_state = (RANDOM_STATE + n_fold), **params)
        print('Fold', n_fold, 'started at', time.ctime())
        train_x, train_y = train_df[feats].iloc[train_idx], train_df['TARGET'].iloc[train_idx]
        valid_x, valid_y = train_df[feats].iloc[valid_idx], train_df['TARGET'].iloc[valid_idx]

        clf.fit(train_x, train_y,
                eval_set = [(train_x, train_y), (valid_x, valid_y)], eval_metric = 'auc',
                eval_names = [f'{n_fold}_train', f'{n_fold}_valid'],
                verbose = verbose, early_stopping_rounds = early_stopping)
        iterations[n_fold] = clf.best_iteration_

        valid_pred_proba[valid_idx] = clf.predict_proba(valid_x, num_iteration = clf.best_iteration_)[:, 1]

        test_prediction[n_fold] = pd.Series(clf.predict_proba(test_df[feats], num_iteration = clf.best_iteration_)[:, 1],
                                            index = submission_sample['SK_ID_CURR'])

        df_feature_importance[n_fold] = pd.Series(clf.feature_importances_, index = feats)
        # Train AUC at the best iteration (evals_result_ is 0-based, best_iteration_ is 1-based).
        train_scores[n_fold] = clf.evals_result_[f'{n_fold}_train']['auc'][clf.best_iteration_-1]
        fold_scores[n_fold] = roc_auc_score(valid_y, valid_pred_proba[valid_idx])
        print('Fold %2d train AUC: %.6f, valid AUC: %.6f; best iteration %5d;' % (n_fold,
                                                                                  train_scores[n_fold],
                                                                                  fold_scores[n_fold],
                                                                                  iterations[n_fold]))
        del train_x, train_y, valid_x, valid_y
        gc.collect()

    roc_auc_valid = roc_auc_score(train_df['TARGET'], valid_pred_proba)

    print('Full AUC score %.6f' % roc_auc_valid)
    print(f'AUC score across {num_folds} folds is mean {fold_scores.mean():.4f} with std {fold_scores.std():.4f}',
          f'(train score mean {train_scores.mean():.4f} with std {train_scores.std():.4f})')

    df_feature_importance.fillna(0, inplace = True)
    df_feature_importance['mean'] = df_feature_importance.mean(axis = 1)
    important_feats = df_feature_importance[df_feature_importance['mean']>0]\
                        .sort_values(by='mean', ascending = False).index.tolist()

    model_description = f'{model_name}_{len(feats)}feats_{num_folds}CV_{roc_auc_valid*1e5:0>6.0f}'
    # Write prediction files
    if save_train_prediction:
        df_prediction = pd.DataFrame({'SK_ID_CURR': train_df['SK_ID_CURR'].values,
                                      'TARGET': valid_pred_proba})
        train_prediction_file_name = f'{folder_to_save}{model_description}_oof_train.csv'
        print(f'Saving oof train predictions to {train_prediction_file_name}')
        df_prediction.to_csv(train_prediction_file_name, index = False)
        del df_prediction
        gc.collect()

    if save_test_prediction:
        test_predictions_file_name = f'{folder_to_save}{model_description}_{num_folds}_test_preds.csv'
        print(f'Saving {num_folds} folds test predictions to {test_predictions_file_name}')
        test_prediction.to_csv(test_predictions_file_name, index = False)

        df_prediction = pd.DataFrame({'SK_ID_CURR': submission_sample['SK_ID_CURR'].values,
                                      'TARGET': test_prediction.mean(axis = 1)})
        test_mean_prediction_file_name = f'{folder_to_save}{model_description}_test_mean.csv'
        print(f'Saving mean of {num_folds} test predictions to {test_mean_prediction_file_name}')
        df_prediction.to_csv(test_mean_prediction_file_name, index = False)

        df_prediction = pd.DataFrame({'SK_ID_CURR': submission_sample['SK_ID_CURR'].values,
                                      'TARGET': gmean(test_prediction, axis=1)})
        test_gmean_prediction_file_name = f'{folder_to_save}{model_description}_test_gmean.csv'
        print(f'Saving geometric mean of {num_folds} test predictions to {test_gmean_prediction_file_name}')
        df_prediction.to_csv(test_gmean_prediction_file_name, index = False)

        # rank_average_df() adds columns to test_prediction, so it must stay the last blend.
        df_prediction = pd.DataFrame({'SK_ID_CURR': submission_sample['SK_ID_CURR'].values,
                                      'TARGET': rank_average_df(test_prediction)})
        test_ranked_prediction_file_name = f'{folder_to_save}{model_description}_test_rank_averaged.csv'
        print(f'Saving rank average of {num_folds} test predictions to {test_ranked_prediction_file_name}')
        df_prediction.to_csv(test_ranked_prediction_file_name, index = False)

        del df_prediction
        gc.collect()

    if train_full_model:
        print(f'Full train learning for {iterations.max()} iterations',
              f'on {len(important_feats)} features of original {len(feats)}',
              'started at', time.ctime())
        train_x, train_y = train_df[important_feats], train_df['TARGET']
        # Work on a copy so the caller's params dict keeps its original n_estimators.
        full_params = dict(params, n_estimators = int(iterations.max()))
        clf_full = LGBMClassifier(importance_type = 'gain',
                                 random_state = RANDOM_STATE, **full_params)
        clf_full.fit(train_x, train_y,
                verbose = verbose)

        fulltrained_model_description = f'{model_name}_{len(important_feats)}feats_{num_folds}CV_{roc_auc_valid*1e5:0>6.0f}'
        if save_test_prediction:
            test_full_prediction_file_name = f'{folder_to_save}{fulltrained_model_description}_test_fulltrained.csv'
            print(f'Saving fulltrained test predictions to {test_full_prediction_file_name}')
            df_prediction = pd.DataFrame({'SK_ID_CURR': submission_sample['SK_ID_CURR'].values,
                                          'TARGET': clf_full.predict_proba(test_df[important_feats])[:, 1]})
            df_prediction.to_csv(test_full_prediction_file_name, index = False)

            del df_prediction
            gc.collect()

    return df_feature_importance, roc_auc_valid

### LightGBM stacking

In [None]:
%%time
# Keyword arguments forwarded to LGBMClassifier by cv_scores() (via **params).
lgbm_params = {
            'objective': 'binary',
            'nthread': 4,
            'n_estimators': 10000,
            'max_depth': -1,
            'boosting_type': 'gbdt',#goss
            'learning_rate': 0.005,#0.02
            'num_leaves': 600,
            'max_bin': 255,#255 #100
            'colsample_bytree': 0.48,#0.05,#1.0
            'reg_lambda': 600,#1000
            'bagging_fraction': 0.35,#0.15
            'bagging_freq': 15,
            'min_data_in_leaf': 375,#70
            'min_gain_to_split': 0.5,
#             'reg_alpha': .041545473,
#             'min_child_weight': 39.3259775,
            'silent': -1,
            'verbose': -1
}


# 5-fold stratified CV of the stacker; prediction files are written under result_path.
# Returns the per-fold feature importances and the overall out-of-fold AUC.
feature_importance, scor = cv_scores(df, 5, lgbm_params, model_name = 'leonid40latestack',
                                     save_train_prediction = True,
                                     verbose = 200, early_stopping=1000, folder_to_save = result_path,
                                     stratified = True, seed = RANDOM_STATE)

In [None]:
# Feature names ordered from highest to lowest mean importance across folds.
feature_importance.sort_values('mean', ascending = False).index.tolist()

In [None]:
# First 30 rows of the per-fold importance table, sorted by the mean over folds.
feature_importance.sort_values('mean', ascending = False).head(30)

In [None]:
# Pairwise correlation of the prediction columns on the rows without a TARGET (test rows).
df.loc[df['TARGET'].isnull(),
       [f for f in df.columns if f not in ('SK_ID_CURR', 'TARGET')]].corr()

In [None]:
# data = pd.DataFrame({})
# for file in os.listdir('./blend/'):
#     df = pd.read_csv('./blend/'+file)
#     df = df.rename_axis({'target':file},axis=1)
#     data = pd.concat([data, df], axis=1)
# data.corr()

In [None]:
# import pandas.rpy.common as com
# import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# # load the R package ISLR
# infert = com.importr("ISLR")

# # load the Auto dataset
# auto_df = com.load_data('Auto')

# calculate the correlation matrix
# corr = auto_df.corr()
# Correlation between the prediction columns on the rows without a TARGET (test rows).
corr = df.loc[df['TARGET'].isnull(),
              [f for f in df.columns if f not in ('SK_ID_CURR', 'TARGET')]].corr()

# Draw the matrix as a heatmap, labelling both axes with the column names.
sns.set(rc = {'figure.figsize': (16, 12)})
sns.heatmap(corr, xticklabels = corr.columns, yticklabels = corr.columns)

In [None]:
# Correlation between the prediction columns on the rows with a known TARGET (training rows).
corr = df.loc[df['TARGET'].notnull(),
              [f for f in df.columns if f not in ('SK_ID_CURR', 'TARGET')]].corr()

# Draw the matrix as a heatmap, labelling both axes with the column names.
sns.set(rc = {'figure.figsize': (16, 12)})
sns.heatmap(corr, xticklabels = corr.columns, yticklabels = corr.columns)