# Home Credit Default Risk 2018

In [1]:
import numpy as np
import pandas as pd
import gc
import time
import warnings
from tqdm import tqdm
warnings.simplefilter(action = 'ignore', category = FutureWarning)
try:
    import cPickle as pickle
except:
    import pickle
import os

In [2]:
# Seed value defined once for the notebook; used here to seed NumPy's global RNG.
RANDOM_STATE = 2042
np.random.seed(RANDOM_STATE)

In [3]:
# Directory prefix for most files read/written below; the trailing slash is
# required because the f-strings concatenate it directly with the file name.
file_path = '../input/'

In [4]:
# Load the sample submission CSV from the input directory.
submission_df = pd.read_csv(f'{file_path}sample_submission.csv')

In [5]:
from sklearn.metrics import roc_auc_score, precision_score, recall_score
from sklearn.model_selection import KFold, StratifiedKFold

In [6]:
# from dask.distributed import Client

# # If you have a remote cluster running Dask
# # client = Client('tcp://scheduler-address:8786')

# # If you want Dask to set itself up on your personal computer
# client = Client(processes=False)

# from joblib import parallel_backend, Parallel, delayed

In [7]:
from lightgbm import LGBMClassifier

In [8]:
from scipy.stats import ranksums
from scipy.stats import gmean

In [9]:
from bayes_opt import BayesianOptimization

In [10]:
# FLAG_DOCUMENT_* column names grouped under USELESS_COLUMNS.
# The document numbers are listed in the same order as the original literal.
USELESS_COLUMNS = [f'FLAG_DOCUMENT_{doc_no}'
                   for doc_no in (10, 12, 13, 14, 15, 16, 17, 19, 2, 20, 21)]

## Aggregating datasets

### Service functions

In [11]:
def convert_col_to_proper_int(df_col):
    """Return ``df_col`` cast to the narrowest integer dtype that holds its values.

    Only columns whose dtype name starts with ``int`` or ``uint`` are touched;
    anything else is returned unchanged. Columns with a negative minimum are
    tried against int8/int16/int32/int64, all others against
    uint8/uint16/uint32/uint64. If no candidate fits (e.g. min/max are NaN for
    an empty column) the column is returned as-is.
    """
    dtype_name = str(df_col.dtype)
    if not (dtype_name[:3] == 'int' or dtype_name[:4] == 'uint'):
        return df_col

    lowest = df_col.min()
    highest = df_col.max()

    # Signed ladder only when a negative value is present.
    if lowest < 0:
        ladder = (np.int8, np.int16, np.int32, np.int64)
    else:
        ladder = (np.uint8, np.uint16, np.uint32, np.uint64)

    for candidate in ladder:
        bounds = np.iinfo(candidate)
        if lowest >= bounds.min and highest <= bounds.max:
            return df_col.astype(candidate)

    return df_col

def convert_col_to_proper_float(df_col):
    """Return ``df_col`` cast to float32 or float16 when that keeps values distinct.

    Non-float columns are returned unchanged. A narrower dtype is accepted only
    if the number of unique values (as counted by ``np.unique``) is the same
    after the cast; float16 is additionally tried only when min and max lie
    strictly inside the float16 range.
    """
    if str(df_col.dtype)[:5] != 'float':
        return df_col

    distinct_before = len(np.unique(df_col))

    as_single = df_col.astype(np.float32)
    if len(np.unique(as_single)) != distinct_before:
        # float32 would merge distinct values: keep the original precision.
        return df_col

    half_range = np.finfo(np.float16)
    if as_single.min() > half_range.min and as_single.max() < half_range.max:
        as_half = as_single.astype(np.float16)
        if len(np.unique(as_half)) == distinct_before:
            return as_half

    return as_single



def float_to_int(df):
    """Convert whole-valued float columns of ``df`` to compact integer dtypes.

    A float column is converted only when every value is a whole number
    (``value % 1 == 0``; NaN and infinities fail this test) and every value
    lies strictly inside the int64 range. Qualifying columns are cast to int64
    and then narrowed with ``convert_col_to_proper_int``. All other columns are
    left untouched.

    Args:
        df: DataFrame to modify; columns are reassigned in place.

    Returns:
        The same ``df`` object.
    """
    # Whole-valued floats at or beyond 2**63 would be silently corrupted by
    # astype(np.int64), so such columns stay as floats.
    int64_limit = float(2 ** 63)

    for col in df.columns:
        if str(df[col].dtype)[:5] != 'float':
            continue

        values = df[col]
        if not (values % 1 == 0).all():
            continue
        if not (values.abs() < int64_limit).all():
            continue

        df[col] = convert_col_to_proper_int(values.astype(np.int64))

    return df

def float_reduced(df):
    """Narrow every float column of ``df`` in place to reduce memory usage.

    Each column whose dtype name starts with ``float`` is replaced by the
    result of ``convert_col_to_proper_float``; other columns are not touched.
    Returns the same ``df`` object.
    """
    float_columns = [name for name in df.columns
                     if str(df[name].dtype)[:5] == 'float']

    for name in float_columns:
        df[name] = convert_col_to_proper_float(df[name])

    return df

def int_reduced(df):
    """Narrow every integer column of ``df`` in place to reduce memory usage.

    Every column is passed through ``convert_col_to_proper_int``, which itself
    leaves non-integer columns unchanged. Returns the same ``df`` object.
    """
    for name in df.columns:
        narrowed = convert_col_to_proper_int(df[name])
        df[name] = narrowed

    return df

## Thank you, Guillaume Martin, for the awesome memory optimizer!
## https://www.kaggle.com/gemartin/load-data-reduce-memory-usage
def reduce_mem_usage(data, verbose = True):
    """Aggressively downcast the columns of ``data`` in place to save memory.

    Signed and unsigned integer columns are narrowed with
    ``convert_col_to_proper_int``. Any other column that is not object,
    ``datetime64[ns]`` or category is cast to the narrowest of
    float16/float32/float64 whose range strictly contains the column's min and
    max. Only the range is checked, so float values may lose precision.
    Object, ``datetime64[ns]`` and category columns are cast to ``category``.

    Args:
        data: DataFrame to shrink; its columns are reassigned in place.
        verbose: When true, print memory usage before and after, and the
            percentage saved.

    Returns:
        The same ``data`` object.
    """
    start_mem = data.memory_usage().sum() / 1024**2
    if verbose:
        print('Memory usage of dataframe: {:.2f} MB'.format(start_mem))

    for col in data.columns:
        col_type = data[col].dtype

        if ((col_type != object) & (col_type != '<M8[ns]') & (col_type.name != 'category')):
            type_name = str(col_type)
            # Match the gate used inside convert_col_to_proper_int: without the
            # 'uint' check, unsigned columns would fall through to the float
            # branch and be turned into floats.
            if type_name[:3] == 'int' or type_name[:4] == 'uint':
                data[col] = convert_col_to_proper_int(data[col])
            else:
                c_min = data[col].min()
                c_max = data[col].max()
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    data[col] = data[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    data[col] = data[col].astype(np.float32)
                else:
                    # Also reached when min/max are NaN (all comparisons fail).
                    data[col] = data[col].astype(np.float64)
        else:
            data[col] = data[col].astype('category')

    end_mem = data.memory_usage().sum() / 1024**2
    if verbose:
        print('Memory usage after optimization: {:.2f} MB'.format(end_mem))
        print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return data

def gentle_reduce_mem_usage(data, verbose = True):
    """Downcast the columns of ``data`` in place without merging distinct values.

    Signed and unsigned integer columns are narrowed with
    ``convert_col_to_proper_int``. Other non-object, non-``datetime64[ns]``,
    non-category columns are converted to a compact integer dtype when every
    value is a whole number inside the int64 range, and are otherwise passed
    to ``convert_col_to_proper_float``. Object, ``datetime64[ns]`` and
    category columns are cast to ``category``.

    Args:
        data: DataFrame to shrink; its columns are reassigned in place.
        verbose: When true, print memory usage before and after, and the
            percentage saved.

    Returns:
        The same ``data`` object.
    """
    start_mem = data.memory_usage().sum() / 1024**2
    if verbose:
        print('Memory usage of dataframe: {:.2f} MB'.format(start_mem))

    # Whole-valued floats at or beyond 2**63 cannot be represented as int64.
    int64_limit = float(2 ** 63)

    for col in data.columns:
        col_type = data[col].dtype

        if ((col_type != object) & (col_type != '<M8[ns]') & (col_type.name != 'category')):
            type_name = str(col_type)
            # Same gate as convert_col_to_proper_int, so unsigned columns are
            # narrowed directly instead of going through an int64 round trip.
            if type_name[:3] == 'int' or type_name[:4] == 'uint':
                data[col] = convert_col_to_proper_int(data[col])
            elif (data[col] % 1 == 0).all() and (data[col].abs() < int64_limit).all():
                data[col] = convert_col_to_proper_int(data[col].astype(np.int64))
            else:
                data[col] = convert_col_to_proper_float(data[col])
        else:
            data[col] = data[col].astype('category')

    end_mem = data.memory_usage().sum() / 1024**2
    if verbose:
        print('Memory usage after optimization: {:.2f} MB'.format(end_mem))
        print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return data

## Cleaning dataset

In [12]:
def corr_feature_with_target(feature, target):
    """Compare a feature's distribution between the ``target == 0`` and ``target == 1`` rows.

    Returns ``[diff, p]``. ``diff`` is the absolute difference of the class
    means when the feature's unique values are exactly {0, 1}, otherwise of
    the class medians (NaNs are dropped per class first). ``p`` is the
    Wilcoxon rank-sum p-value when both classes have at least 20 non-null
    values, else the sentinel ``2``.
    """
    negatives = feature[target == 0].dropna()
    positives = feature[target == 1].dropna()

    # Binary flags are compared by mean, everything else by median.
    is_binary = set(feature.unique()) == {0, 1}
    center = np.mean if is_binary else np.median
    diff = abs(center(negatives, axis=0) - center(positives, axis=0))

    if len(negatives) < 20 or len(positives) < 20:
        return [diff, 2]

    return [diff, ranksums(negatives, positives)[1]]

In [13]:
def clean_data(data):
    '''
    Prune columns from ``data`` in four passes and return it (modifies the
    source dataframe in place).

    ``data`` must contain a 'TARGET' column; rows with a null TARGET are
    treated as the unlabelled (test) part. The passes are:
      1. drop columns with at most one distinct value;
      2. drop columns whose distribution does not differ between TARGET 0 and 1;
      3. drop columns whose distribution differs between labelled and
         unlabelled rows (restricted to those with zero normalised class
         difference from pass 2);
      4. drop the columns left over after repeatedly peeling off features that
         a LightGBM model assigns non-zero importance.

    Side effects: silences all warnings via ``warnings.simplefilter`` and
    prints the remaining column count after each pass.
    '''
    warnings.simplefilter(action = 'ignore')
    
    # Removing empty features
    nun = data.nunique()
    empty = list(nun[nun <= 1].index)
    
    data.drop(empty, axis = 1, inplace = True)
    print('After removing empty features there are {0:d} features'.format(data.shape[1]))
    
    # Removing features with the same distribution on 0 and 1 classes
    corr = pd.DataFrame(index = ['diff', 'p'])
    # Labelled rows only.
    ind = data[data['TARGET'].notnull()].index
    
    for c in data.columns.drop('TARGET'):
#         corr[c] = corr_feature_with_target(data.loc[ind, c], data.loc[ind, 'TARGET'])
        corr[c] = corr_feature_with_target(data[c][ind], data['TARGET'][ind])

    # One row per feature, columns 'diff' and 'p'.
    corr = corr.T
    # Class difference relative to the column mean (aligned on feature name).
    corr['diff_norm'] = abs(corr['diff'] / data.mean(axis = 0))
    
    # p > .05 here also covers the sentinel p == 2 returned for small samples.
    to_del_1 = corr[((corr['diff'] == 0) & (corr['p'] > .05))].index
    to_del_2 = corr[((corr['diff_norm'] < .5) & (corr['p'] > .05))].drop(to_del_1).index
    to_del = list(to_del_1) + list(to_del_2)
    # The id column is always kept by this pass.
    if 'SK_ID_CURR' in to_del:
        to_del.remove('SK_ID_CURR')
        
    data.drop(to_del, axis = 1, inplace = True)
    print('After removing features with the same distribution on 0 and 1 classes there are {0:d} features'.format(data.shape[1]))
    
    # Removing features with not the same distribution on train and test datasets
    corr_test = pd.DataFrame(index = ['diff', 'p'])
    # Pseudo-target: 1 for labelled rows, 0 for rows with a null TARGET.
    target = data['TARGET'].notnull().astype(int)
    
    for c in data.columns.drop('TARGET'):
        corr_test[c] = corr_feature_with_target(data[c], target)

    corr_test = corr_test.T
    corr_test['diff_norm'] = abs(corr_test['diff'] / data.mean(axis = 0))
    
    bad_features = corr_test[((corr_test['p'] < .05) & (corr_test['diff_norm'] > 1))].index
    # Keep only the shifted features whose pass-2 diff_norm was exactly 0; the
    # boolean mask is indexed by all of corr, not just the selected rows.
    bad_features = corr.loc[bad_features][corr['diff_norm'] == 0].index
    
    data.drop(bad_features, axis = 1, inplace = True)
    print('After removing features with not the same distribution on train and test datasets there are {0:d} features'.format(data.shape[1]))
    
    del corr, corr_test
    gc.collect()
    
    # Removing features not interesting for classifier
    clf = LGBMClassifier(random_state = 0)
    train_index = data[data['TARGET'].notnull()].index
    train_columns = data.drop('TARGET', axis = 1).columns

    # Each round fits on the current candidate columns, scores on the same
    # training rows, and then removes the features with non-zero importance
    # from the candidates for the next round. The loop stops once the training
    # AUC is no longer above .78.
    score = 1
    new_columns = []
    while score > .78:
        train_columns = train_columns.drop(new_columns)
        clf.fit(data.loc[train_index, train_columns], data.loc[train_index, 'TARGET'])
        f_imp = pd.Series(clf.feature_importances_, index = train_columns)
        score = roc_auc_score(data.loc[train_index, 'TARGET'], 
                              clf.predict_proba(data.loc[train_index, train_columns])[:, 1])
        new_columns = f_imp[f_imp > 0].index

    # train_columns now holds every column used in the last (failing) fit; all
    # of them are dropped.
    # NOTE(review): SK_ID_CURR is part of train_columns, so it can be dropped
    # here if it is still a candidate at that point - confirm this is intended.
    data.drop(train_columns, axis = 1, inplace = True)
    print('After removing features not interesting for classifier there are {0:d} features'.format(data.shape[1]))

    return data

In [14]:
# %%time
# df = pd.read_pickle(f'{file_path}aggregated_df2_cleaned.pkl.zip')
# df.info()

In [15]:
# %%time
# feats = [f for f in df.columns if f not in ['TARGET','SK_ID_CURR','SK_ID_BUREAU','SK_ID_PREV','index']]
# X = df[df['TARGET'].notnull()][feats]
# y = df[df['TARGET'].notnull()]['TARGET']
# del df
# gc.collect()

In [16]:
# print(X.shape, y.shape)
# gc.collect()

In [17]:
# %%time
# from boostaroota import BoostARoota
# br = BoostARoota(metric='auc', iters = 6)
# br.fit(X, y)
# pd.DataFrame(list(br.keep_vars_)).sort_values(by=0).to_csv(f'boostaroota_{len(list(br.keep_vars_))}_keep_vars.csv',
#                                                            index = False, header = False)

In [18]:
# remaining_vars = list(br.keep_vars_)
# print(len(remaining_vars))

In [19]:
# [f for f in df.columns if f in ['TARGET','SK_ID_CURR','SK_ID_BUREAU','SK_ID_PREV','index']]

In [20]:
# ['SK_ID_CURR', 'TARGET'] + remaining_vars

In [21]:
# df = df[['SK_ID_CURR', 'TARGET'] + remaining_vars]

In [22]:
# df.to_pickle(f'{file_path}aggregated_df2_boostarooted314.pkl.zip')

In [23]:
# # df[remaining_vars].select_dtypes(include=[np.int16, np.uint32, np.uint8]).nunique(axis=0, dropna = False).sort_value()
# categorical_feats = df[remaining_vars].nunique(axis=0, dropna = False)\
#                     .loc[df[remaining_vars].nunique(axis=0, dropna = False)<100].index.tolist()

In [24]:
# df[categorical_feats]= df[categorical_feats].astype('category')

In [25]:
# %%time
# df = pd.read_pickle(f'{file_path}aggregated_df2_boostarooted314.pkl.zip')
# df.info()

In [26]:
# df.info()

In [28]:
# Load the pickled PAID series and preview its first ten entries.
prev_paid = pd.read_pickle(f'{file_path}prev_paid_series.pkl.zip')
prev_paid.head(10)

SK_ID_CURR
100001    0.000000
100002    0.000000
100003    0.000000
100004    0.000000
100005    0.000000
100006    0.000000
100007    0.000000
100008    0.083333
100009    0.000000
100010    0.000000
Name: PAID, dtype: float64

In [29]:
# Number of entries in the PAID series.
len(prev_paid)

338857

In [28]:
%%time
# Load the pickled extended feature frame and print its summary.
iv_extended_feat = pd.read_pickle(f'{file_path}iv_extended_feat.pkl.zip')
iv_extended_feat.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 356244 entries, 0 to 356243
Columns: 287 entries, ('amax', 'AMT_CREDIT_MAX_OVERDUE', 'Active') to app most popular AMT_GOODS_PRICE
dtypes: float16(49), float32(133), float64(51), int16(18), int32(8), int8(6), uint16(1), uint32(1), uint8(20)
memory usage: 386.6 MB
Wall time: 3.37 s


In [29]:
# Feature names are taken from the first column of the importance CSV;
# the frame is then reduced to SK_ID_CURR plus those features.
iv_extended_feat_imp_feat = pd.read_csv(f'{file_path}leonid23(iv_ext)_fi_sorted.csv').iloc[:,0].tolist()
iv_extended_feat = iv_extended_feat[['SK_ID_CURR'] + iv_extended_feat_imp_feat]
iv_extended_feat.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 356244 entries, 0 to 356243
Columns: 286 entries, SK_ID_CURR to ('mean', 'AMT_DOWN_PAYMENT', 'XNA')
dtypes: float16(48), float32(133), float64(51), int16(18), int32(8), int8(6), uint16(1), uint32(1), uint8(20)
memory usage: 385.9 MB


In [30]:
%%time
# Load df4 (note: read from '../data/', not from file_path) and shrink its dtypes.
df4 = pd.read_pickle('../data/df.pkl.gz')
# df4 = df4.join(pd.DataFrame(prev_paid), on='SK_ID_CURR')
# df4['PAID'].fillna(0, inplace = True)
df4 = gentle_reduce_mem_usage(df4, verbose = False)
df4.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 356251 entries, 0 to 356254
Columns: 721 entries, index to CC_COUNT
dtypes: float16(289), float32(135), float64(139), int16(2), int8(1), uint32(2), uint8(153)
memory usage: 816.8 MB
Wall time: 55.9 s


In [31]:
# Candidate feature names come from the first column of the importance CSV.
df4_important_features = pd.read_csv(f'{file_path}leonid22_fi_sorted.csv').iloc[:,0].tolist()
# Build the exclusion set once: id/target helper columns plus every feature
# already supplied by iv_extended_feat. This avoids re-concatenating and
# linearly scanning the list for each candidate.
df4_excluded = set(['TARGET','SK_ID_BUREAU','SK_ID_PREV', 'index', 'PAID']
                   + iv_extended_feat_imp_feat)
df4_important_features = [f for f in df4_important_features if f not in df4_excluded]
df4 = df4[['SK_ID_CURR'] + df4_important_features]
df4.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 356251 entries, 0 to 356254
Columns: 495 entries, SK_ID_CURR to BURO_CREDIT_CURRENCY_currency 1_MEAN
dtypes: float16(174), float32(114), float64(132), uint32(1), uint8(74)
memory usage: 661.1 MB


In [32]:
%%time
df3 = pd.read_pickle(f'{file_path}data_7_house_leak_paid.pkl.zip')
# Candidate feature names come from the first column of the importance CSV.
df3_important_features = pd.read_csv(f'{file_path}leonid15_fi_sorted.csv').iloc[:,0].tolist()
# Build the exclusion set once: helper columns plus everything already covered
# by iv_extended_feat and df4 (df4's columns include SK_ID_CURR). This avoids
# re-concatenating and linearly scanning the list for each candidate.
df3_excluded = set(['TARGET','SK_ID_BUREAU','SK_ID_PREV', 'index', 'PAID']
                   + iv_extended_feat_imp_feat
                   + df4.columns.tolist())
df3_important_features = [f for f in df3_important_features if f not in df3_excluded]
# df3 is the only frame that keeps TARGET.
df3 = df3[['SK_ID_CURR', 'TARGET'] + df3_important_features]
df3.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 356255 entries, 0 to 356254
Columns: 857 entries, SK_ID_CURR to CLOSED_DPD_1 / Month_MIN_STD
dtypes: float16(462), float32(302), float64(27), uint32(1), uint8(65)
memory usage: 821.2 MB
Wall time: 9.05 s


In [33]:
%%time
df2 = pd.read_pickle(f'{file_path}aggregated_df2_boostarooted314.pkl.zip')
# Build the exclusion set once: helper columns plus everything already covered
# by iv_extended_feat, df4 and df3. This avoids re-concatenating and linearly
# scanning the list for each candidate column.
df2_excluded = set(['TARGET','SK_ID_BUREAU','SK_ID_PREV', 'index', 'PAID']
                   + iv_extended_feat_imp_feat
                   + df4.columns.tolist()
                   + df3.columns.tolist())
df2_important_features = [f for f in df2.columns.tolist() if f not in df2_excluded]
df2 = df2[['SK_ID_CURR'] + df2_important_features]
df2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 356244 entries, 0 to 356254
Data columns (total 44 columns):
SK_ID_CURR                                                       356244 non-null uint32
app AMT_INCOME_TOTAL / CNT_FAM_MEMBERS                           356244 non-null float32
app DAYS_LAST_PHONE_CHANGE / DAYS_EMPLOYED                       356243 non-null float32
CODE_GENDER_0                                                    356244 non-null uint8
REGION_RATING_CLIENT_W_CITY_1                                    356244 non-null uint8
BURO_DPD_0 / Month_MIN_MEAN                                      130695 non-null float64
BURO_STATUS_1_MEAN_MIN                                           134542 non-null float16
ACTIVE_DAYS_CREDIT_SIZE                                          251810 non-null float16
PREV_AMT_ANNUITY_SIZE                                            338851 non-null float16
PREV_NFLAG_INSURED_ON_APPROVAL_-inf_MEAN                         338851 non-null float16
APPR

In [34]:
%%time
# Left-join the three feature frames onto df3 by SK_ID_CURR, in the same
# order as before: iv_extended_feat, then df4, then df2.
df_extended = (
    df3
    .merge(iv_extended_feat, on='SK_ID_CURR', how='left')
    .merge(df4, on='SK_ID_CURR', how='left')
    .merge(df2, on='SK_ID_CURR', how='left')
)
df_extended.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 356255 entries, 0 to 356254
Columns: 1679 entries, SK_ID_CURR to CARD_card AMT_BALANCE - AMT_RECIVABLE_VAR
dtypes: float16(698), float32(557), float64(358), uint32(1), uint8(65)
memory usage: 2.2 GB
Wall time: 5min 2s


In [35]:
# Shape of the rows that have no TARGET value.
df_extended[df_extended['TARGET'].isnull()].shape

(48744, 1679)

In [36]:
# Feature names taken from the first column of the "strong" importance CSV.
strong_features = pd.read_csv(f'{file_path}leonid24(df_ext01)_fi_strong.csv').iloc[:,0].tolist()
len(strong_features)

500

In [39]:
%%time
# Keep only the id, the target and the strong features, then shrink dtypes.
df_extended = df_extended[['SK_ID_CURR', 'TARGET'] + strong_features]
df_extended = gentle_reduce_mem_usage(df_extended, verbose = False)
df_extended.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 356255 entries, 0 to 356254
Columns: 502 entries, SK_ID_CURR to ACTIVE_DAYS_CREDIT_ENDDATE_SUM
dtypes: float16(182), float32(230), float64(89), uint32(1)
memory usage: 682.2 MB
Wall time: 48.6 s


In [29]:
%%time
# Reload the four source frames unfiltered; the next cell collects their column names.
df2 = pd.read_pickle(f'{file_path}aggregated_df2_boostarooted314.pkl.zip')
df3 = pd.read_pickle(f'{file_path}data_7_house_leak_paid.pkl.zip')
df4 = pd.read_pickle('../data/df.pkl.gz')
iv_extended_feat = pd.read_pickle(f'{file_path}iv_extended_feat.pkl.zip')

Wall time: 23.9 s


In [30]:
# Pool the column names of the four frames in order (df2, df3, df4,
# iv_extended_feat); duplicates are kept at this stage.
processed_features = [column
                      for frame in (df2, df3, df4, iv_extended_feat)
                      for column in frame.columns.tolist()]

In [58]:
# Number of collected column names (duplicates included).
len(processed_features)

2619

In [31]:
# Drop the references to the reloaded frames and run a garbage-collection pass.
del df2, df3, df4, iv_extended_feat
gc.collect()

0

In [32]:
%%time
# Load aggregated_df2 only to add its column names; from here on
# processed_features is de-duplicated (set union) and its order is arbitrary.
aggregated_df2 = pd.read_pickle(f'{file_path}aggregated_df2.pkl.zip')
processed_features = list(set(processed_features) | set(aggregated_df2.columns))
del aggregated_df2
gc.collect()


Wall time: 27.4 s


In [63]:
# Number of distinct column names collected so far.
len(processed_features)

4241

In [33]:
%%time
# Load the tEarth_cat6 feature CSV (read from '../features/', not from file_path).
tearth_cat6 = pd.read_csv('../features/tEarth_cat6.csv')
tearth_cat6.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 356244 entries, 0 to 356243
Columns: 758 entries, AMT_ANNUITY to CARD_SK_DPD_SUM
dtypes: float64(734), int64(24)
memory usage: 2.0 GB
Wall time: 51.3 s


In [34]:
%%time
# Only the union below is active: it adds tearth_cat6's column names to
# processed_features. The filtering/pickling steps are commented out.
# tearth_cat6_imp_f = [f for f in tearth_cat6.columns.tolist() if f not in ['TARGET','SK_ID_BUREAU','SK_ID_PREV',
#                                                                  'index', 'PAID'] + processed_features + USELESS_COLUMNS\
#                                                              + [x for x in tearth_cat6.columns.tolist() if 'SK_ID_'  in x]  ]
processed_features = list(set(processed_features) | set(tearth_cat6.columns.tolist()))
# tearth_cat6 = tearth_cat6[['SK_ID_CURR'] + tearth_cat6_imp_f]
# tearth_cat6 = gentle_reduce_mem_usage(tearth_cat6, verbose = False)
# tearth_cat6.to_pickle(f'{file_path}tearth_cat6.pkl.zip')
# tearth_cat6.info()

Wall time: 500 µs


In [17]:
%%time
# Stack the train and test PCA/cluster CSVs row-wise under a fresh index,
# record their column names, and cache the combined frame as a pickle.
pca_cluster = pd.concat([pd.read_csv(f'{file_path}pca+cluster/iv_train_stack_feat.csv'),
                      pd.read_csv(f'{file_path}pca+cluster/iv_test_stack_feat.csv')], ignore_index=True,
                     axis=0, verify_integrity=True)
processed_features = list(set(processed_features) | set(pca_cluster.columns.tolist()))
pca_cluster.to_pickle(f'{file_path}iv_pca_cluster.pkl.zip')
pca_cluster.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 356244 entries, 0 to 356243
Data columns (total 12 columns):
SK_ID_CURR    356244 non-null int64
pca_col_0     356244 non-null float64
pca_col_1     356244 non-null float64
pca_col_2     356244 non-null float64
pca_col_3     356244 non-null float64
pca_col_4     356244 non-null float64
pca_col_5     356244 non-null float64
pca_col_6     356244 non-null float64
pca_col_7     356244 non-null float64
pca_col_8     356244 non-null float64
pca_col_9     356244 non-null float64
cluster       356244 non-null int64
dtypes: float64(10), int64(2)
memory usage: 32.6 MB
Wall time: 2.98 s


In [None]:
%%time
# Build the v2 extended feature frame: train features joined with their
# targets (inner merge on SK_ID_CURR), then stacked on top of the test features.
iv_extended_v2_feat_train = pd.read_csv(f'{file_path}iv ext feat v.2/iv_train_383feat.csv')
# The target file has no header row; column names are supplied explicitly.
iv_extended_v2_feat_target = pd.read_csv(f'{file_path}iv ext feat v.2/iv_target.csv',
                      header = None, names = ['SK_ID_CURR', 'TARGET'])
iv_extended_v2_feat_test = pd.read_csv(f'{file_path}iv ext feat v.2/iv_test_383feat.csv')
iv_extended_v2_feat_train = iv_extended_v2_feat_train.merge(iv_extended_v2_feat_target,
                                                     on='SK_ID_CURR')
iv_extended_v2_feat = pd.concat([iv_extended_v2_feat_train, iv_extended_v2_feat_test],
                             ignore_index = True, axis=0, verify_integrity = True)
# iv_extended_v2_feat = gentle_reduce_mem_usage(iv_extended_v2_feat, verbose = False)
# Release the intermediate frames now that the combined frame exists.
del iv_extended_v2_feat_train, iv_extended_v2_feat_target, iv_extended_v2_feat_test
gc.collect()
# iv_extended_v2_feat.to_pickle(f'{file_path}iv_extended_v2_feat_full.pkl.zip')
# iv_extended_v2_feat.info()

In [None]:
# Only the union below is active: it adds iv_extended_v2_feat's column names
# to processed_features. The filtering steps are commented out.
# iv_extended_v2_imp_f = [f for f in iv_extended_v2_feat.columns.tolist() if f not in ['TARGET','SK_ID_BUREAU','SK_ID_PREV',
#                                                                  'index', 'PAID'] + processed_features + USELESS_COLUMNS\
#                                                       + [x for x in iv_extended_v2_feat.columns.tolist() if 'SK_ID_'  in x]  ]
processed_features = list(set(processed_features) | set(iv_extended_v2_feat.columns.tolist()))
# iv_extended_v2_feat = iv_extended_v2_feat[['SK_ID_CURR'] + iv_extended_v2_imp_f]
# len(iv_extended_v2_imp_f)

In [None]:
# Persist the collected column names as a one-column pickled DataFrame.
pd.DataFrame({'processed_features': processed_features}).to_pickle(f'{file_path}processed_features_v2.pkl.zip')

In [14]:
# Restore the column-name list from the pickle written above.
processed_features = pd.read_pickle(f'{file_path}processed_features_v2.pkl.zip')['processed_features'].tolist()

In [86]:
%%time
# Left-join the additional feature frames onto df_extended by SK_ID_CURR.
# Rebinding after each merge lets the previous intermediate frame be freed.
df_extended = df_extended.merge(tearth_cat6, on='SK_ID_CURR', how='left')
df_extended = df_extended.merge(pca_cluster, on='SK_ID_CURR', how='left')
df_extended = df_extended.merge(iv_extended_v2_feat, on='SK_ID_CURR', how='left')
df_extended.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 356255 entries, 0 to 356254
Columns: 667 entries, SK_ID_CURR to nrm_app most popular AMT_GOODS_PRICE
dtypes: float16(222), float32(255), float64(189), uint32(1)
memory usage: 1015.2 MB
Wall time: 1min 23s


In [88]:
# Columns whose name contains 'EXT_SOURCE'.
remove_ext = [x for x in df_extended.columns.tolist() if 'EXT_SOURCE'  in x]

In [90]:
# Drop the EXT_SOURCE-related columns in place and print the new summary.
df_extended.drop(remove_ext, axis=1, inplace=True)
df_extended.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 356255 entries, 0 to 356254
Columns: 640 entries, SK_ID_CURR to nrm_app most popular AMT_GOODS_PRICE
dtypes: float16(210), float32(253), float64(176), uint32(1)
memory usage: 969.0 MB


In [91]:
# Shape of the rows that have no TARGET value.
df_extended[df_extended['TARGET'].isnull()].shape

(48744, 640)

In [92]:
# Shrink column dtypes after the merges and print the new summary.
df_extended = gentle_reduce_mem_usage(df_extended, verbose = False)
df_extended.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 356255 entries, 0 to 356254
Columns: 640 entries, SK_ID_CURR to nrm_app most popular AMT_GOODS_PRICE
dtypes: float16(215), float32(253), float64(171), uint32(1)
memory usage: 958.8 MB


In [93]:
# Cache the merged frame as the v2 pickle.
df_extended.to_pickle(f'{file_path}df_extended_v2.pkl.zip')

In [87]:
%%time
# Load the gp123 frame and print its summary.
gp123 = pd.read_pickle(f'{file_path}gp123.pkl')
gp123.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 356251 entries, 0 to 356250
Columns: 1537 entries, SK_ID_CURR to 1535
dtypes: float64(1536), int64(1)
memory usage: 4.1 GB
Wall time: 5.92 s


In [None]:
# Cache the current df_extended under the un-versioned file name.
df_extended.to_pickle(f'{file_path}df_extended.pkl.zip')

In [None]:
%%time
# Reload df_extended from the un-versioned pickle.
df_extended = pd.read_pickle(f'{file_path}df_extended.pkl.zip')

In [None]:
# Attach the PAID series (indexed by SK_ID_CURR) to every row of df_extended.
df_extended2 = df_extended.join(pd.DataFrame(prev_paid), on='SK_ID_CURR')
# Rows without a matching PAID entry get 0. The result is assigned back
# because an in-place fillna on the selected column is a chained assignment
# that does not update the frame under pandas Copy-on-Write.
df_extended2['PAID'] = df_extended2['PAID'].fillna(0)