In [27]:
# 0. Load libraries #

import numpy as np
import pandas as pd
import os, time, warnings, random, gc, pickle, optuna
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.svm import SVC
from sklearn.preprocessing import LabelBinarizer, LabelEncoder, OrdinalEncoder, OneHotEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, cross_val_predict, GridSearchCV, train_test_split, KFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier, VotingClassifier
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, roc_auc_score, precision_recall_curve
from sklearn.inspection import permutation_importance
from xgboost import XGBClassifier

pd.set_option('display.max_columns', 300)
pd.set_option('display.max_rows', 300)
pd.set_option('mode.chained_assignment', None)
pd.set_option('display.expand_frame_repr', False)
warnings.filterwarnings('ignore')

# Load custom pre-processing functions:

def draw_histograms(df, variables, n_rows, n_cols):
    # stolen from https://stackoverflow.com/questions/29530355/plotting-multiple-histograms-in-grid
    fig=plt.figure()
    for i, var_name in enumerate(variables):
        ax=fig.add_subplot(n_rows,n_cols,i+1)
        df[var_name].hist(bins=10,ax=ax)
        ax.set_title(var_name+" Distribution")
    fig.tight_layout()  
    plt.show()


def fillna_mp_i1(df_train, df_test, df_pred, num_features, cat_features, num_fill='median', cat_fill='mode'):
    """This function speeds up filling missing values for 3 main datasets using different imputation methods.
    Later may replace it with some subclass.
    Example: fillna_mp_i1(X_train, X_test, X_pred, num_cols, cat_cols)"""
    # set df_pred to None if it does not exist
    if not ((cat_fill=='mode') and (num_fill=='median')):
        print ('Imputation method not Implemented yet!')
        return None
    
    df_train[num_features] = df_train[num_features].fillna(value=df_train[num_features].median())
    df_test[num_features] = df_test[num_features].fillna(value=df_train[num_features].median())
    df_train[cat_features] = df_train[cat_features].fillna(value=df_train[cat_features].mode().iloc[0])
    df_test[cat_features] = df_test[cat_features].fillna(value=df_train[cat_features].mode().iloc[0])
    if (df_pred is not None):
        df_pred[num_features] = df_pred[num_features].fillna(value=df_train[num_features].median())
        df_pred[cat_features] = df_pred[cat_features].fillna(value=df_train[cat_features].mode().iloc[0])
    df_train[num_features+cat_features].count
    
    all_good = (
    (np.prod(df_train[num_features+cat_features].shape)==df_train[num_features+cat_features].count().sum()) and 
    (np.prod(df_test[num_features+cat_features].shape) == df_test[num_features+cat_features].count().sum()) and 
    (np.prod(df_pred[num_features+cat_features].shape) == df_pred[num_features+cat_features].count().sum()))
    if (all_good):
        print('Missing values imputed successfully')
    else:
        print('There are still some missing values...')
    
def add_misDummy_mp_i1(df_train, df_test, df_pred, features):
    """This function creates new dummy columns for missing features.
    Example: add_misDummy_mp_i1(X_train, X_test, X_pred, ['Age'])"""
    # set df_pred to None if it does not exist
    for feature_name in features:
        misColName = 'mis'+feature_name
        df_train.loc[df_train[feature_name].isnull(), misColName]=1
        df_train.loc[df_train[feature_name].notnull(), misColName]=0
        df_test.loc[df_test[feature_name].isnull(), misColName]=1
        df_test.loc[df_test[feature_name].notnull(), misColName]=0
        if (df_pred is not None):
            df_pred.loc[df_pred[feature_name].isnull(), misColName]=1
            df_pred.loc[df_pred[feature_name].notnull(), misColName]=0
   

def discretize_mp_i1(df_train, df_test, df_pred, feature, ntiles, delete_feature=False):
    """This function divides a continuous feature into quantile groups.
    Example: discretize_mp_i1(X_train, X_test, X_pred, 'Age', 15)"""
    # set df_pred to None if it does not exist
    _,bin = pd.qcut(df_train[feature], ntiles, retbins = True, labels = False, duplicates = 'drop')
    df_train[feature+'Ntile'] = pd.cut(df_train[feature], labels=False, duplicates = 'drop', bins = bin ,include_lowest = True)
    df_test[feature+'Ntile'] = pd.cut(df_test[feature], labels=False, duplicates = 'drop', bins = bin ,include_lowest = True)
    if (df_pred is not None):
        df_pred[feature+'Ntile'] = pd.cut(df_pred[feature], labels=False, duplicates = 'drop', bins = bin ,include_lowest = True)
    if (delete_feature==True):
        df_train.drop(columns=[feature], inplace=True)
        df_test.drop(columns=[feature], inplace=True)
        df_pred.drop(columns=[feature], inplace=True)
    print('Discretized ',feature, ' into ', len(bin)-1, ' bins')


def log_transformer_mp_i1(df_train, df_test, df_pred, feature_subset=False, min_skew=3):
    """This function divides a continuous feature into quantile groups.
    Example: log_transformer_mp_i1(X_train, X_test, X_pred, feature_subset=num_cols)"""
    # set df_pred to None if it does not exist
    if (feature_subset==False):
        features_totransform = df_train.columns
    else:
        features_totransform = feature_subset.copy()
    skewed_vars = list(df_train.skew()[abs(df_train.skew())>min_skew].index)
    for col in list(set(skewed_vars)&set(features_totransform)):
        df_train[col] = np.log1p(df_train[col])
        df_test[col] = np.log1p(df_test[col])
        if (df_pred is not None):
            df_pred[col] = np.log1p(df_pred[col])
    print('Skewed columns log-transformed: ', list(set(skewed_vars)&set(features_totransform)))
    
    


In [14]:
time0 = time.time()
with open('../input/amex-default-downsampled-01/amex_default_0.1sample.pickle', 'rb') as pickled_one:
    df = pickle.load(pickled_one)
display(df.head(),df.shape)

Unnamed: 0,customer_ID,S_2,P_2,D_39,B_1,B_2,R_1,S_3,D_41,B_3,D_42,D_43,D_44,B_4,D_45,B_5,R_2,D_46,D_47,D_48,D_49,B_6,B_7,B_8,D_50,D_51,B_9,R_3,D_52,P_3,B_10,D_53,S_5,B_11,S_6,D_54,R_4,S_7,B_12,S_8,D_55,D_56,B_13,R_5,D_58,S_9,B_14,D_59,D_60,D_61,B_15,S_11,D_62,D_63,D_64,D_65,B_16,B_17,B_18,B_19,D_66,B_20,D_68,S_12,R_6,S_13,B_21,D_69,B_22,D_70,D_71,D_72,S_15,B_23,D_73,P_4,D_74,D_75,D_76,B_24,R_7,D_77,B_25,B_26,D_78,D_79,R_8,R_9,S_16,D_80,R_10,R_11,B_27,D_81,D_82,S_17,R_12,B_28,R_13,D_83,R_14,R_15,D_84,R_16,B_29,B_30,S_18,D_86,D_87,R_17,R_18,D_88,B_31,S_19,R_19,B_32,S_20,R_20,R_21,B_33,D_89,R_22,R_23,D_91,D_92,D_93,D_94,R_24,R_25,D_96,S_22,S_23,S_24,S_25,S_26,D_102,D_103,D_104,D_105,D_106,D_107,B_36,B_37,R_26,R_27,B_38,D_108,D_109,D_110,D_111,B_39,D_112,B_40,S_27,D_113,D_114,D_115,D_116,D_117,D_118,D_119,D_120,D_121,D_122,D_123,D_124,D_125,D_126,D_127,D_128,D_129,B_41,B_42,D_130,D_131,D_132,D_133,R_28,D_134,D_135,D_136,D_137,D_138,D_139,D_140,D_141,D_142,D_143,D_144,D_145
0,00007889e4fcd2614b6cbe7f8f3d2e5c728eca32d9eb8a...,2017-05-30,0.937438,0.003936,0.003352,0.814304,0.002594,,0.000545,0.003142,,0.051788,0.006299,0.056203,0.294726,0.006012,0.004684,0.516281,0.463092,0.263574,,0.048533,0.042849,0.006816,0.076369,0.004431,0.006775,0.005824,0.192826,0.577143,0.044728,,0.00328,0.007864,1.000973,1.008453,0.000671,,0.13955,0.003945,0.177336,0.169218,0.008859,0.009552,0.331046,,0.005952,0.292875,0.002996,0.174586,0.005551,0.288992,0.081457,CO,O,0.009775,0.008577,,0.645052,0.001056,1.0,0.001964,6.0,0.19399,0.000306,0.001171,0.000274,0.006808,0.005755,0.259391,0.015849,0.00157,0.507748,0.025547,,0.00137,0.150138,0.137811,,0.009684,0.004231,,0.005078,0.00355,0.003895,0.000551,0.008775,,0.003207,0.004396,0.004527,0.008564,0.006419,0.007386,,0.000966,1.009854,0.058119,0.003344,0.007251,0.003969,0.008143,0.005722,0.009559,,0.0,0.004709,0.001287,,0.006435,0.008478,,1,0.00574,0.005207,0.005587,0.000917,0.002636,0.001219,1.008942,0.000116,0.0026,0.001327,0.005468,0.006224,2.9e-05,0.003084,5.4e-05,0.005109,0.002801,0.301518,0.140075,0.082386,0.977059,0.001901,0.413177,1.000086,0.958687,0.661508,,1.003331,0.006081,0.009483,,1.005789,2.0,,0.000268,,,,1.00941,0.165166,,0.001087,1.0,0.425165,0.0,4.0,0.418822,0.419459,0.0,0.547555,0.43812,0.003239,0.187171,0.006194,1.0,0.009093,1.005538,1.000016,0.00345,,1.005729,0.001219,,0.002394,0.004556,,,,,,0.008103,0.006132,0.003267,,0.008814,0.001211,0.001624
1,00013181a0c5fc8f1ea38cd2b90fe8ad2fa8cad9d9f13e...,2017-06-16,0.471242,0.001547,0.000233,0.816437,0.003437,,0.005852,0.005877,,0.20844,0.382538,0.313276,0.054264,0.002035,0.002264,0.95178,0.32547,0.740619,,0.00779,1.047744,1.005298,0.076422,0.000206,0.005905,0.009041,0.148017,0.493458,0.039864,,0.006881,0.003204,1.005171,1.001148,0.007656,,0.010886,0.00733,0.448186,0.126476,0.00481,0.003998,0.619877,,0.003674,0.375356,0.001085,0.599261,0.004723,0.285493,0.017829,CO,O,0.001949,0.005428,,0.532836,0.001264,,0.002074,5.0,0.184956,0.004232,0.004607,0.000368,0.00262,0.003322,1.008265,0.034615,0.001517,0.500245,0.995367,,0.001463,0.365847,0.338917,,0.007694,0.004741,,0.004701,0.009776,0.004254,0.001066,0.003238,0.171472,0.003671,0.409818,0.00182,0.001104,0.008046,0.001091,,0.001583,1.004534,0.211125,0.002501,0.007636,0.009532,0.004129,0.001284,0.008175,,0.0,0.003752,0.004509,,0.001339,0.008494,,1,0.005238,0.008327,0.002736,0.007807,0.005138,0.009324,1.006314,0.005657,0.006198,0.006972,0.007,0.001464,0.001995,0.004677,0.009206,0.008958,0.008415,0.302657,0.134319,0.086714,0.972725,8e-05,0.00358,0.008142,0.003662,,,0.008267,0.00091,0.002263,0.08133,0.015774,1.0,,0.005904,,,,1.002032,1.113697,,0.004361,1.0,0.044666,0.0,4.0,0.045775,0.044385,0.0,0.755079,0.291698,0.00917,0.455286,0.006972,1.0,0.008171,1.004957,1.009854,0.000533,,1.002896,0.006603,,0.007623,0.006064,,,,,,0.009825,0.005849,0.00335,,0.006323,0.007118,0.00742
2,00013181a0c5fc8f1ea38cd2b90fe8ad2fa8cad9d9f13e...,2017-08-17,0.467701,0.006205,0.005869,0.819957,0.000378,,0.008037,0.008558,,0.148911,0.627657,0.48063,0.063674,2.8e-05,0.005044,0.935256,0.327515,0.901241,,0.0101,1.164417,1.001536,0.081456,0.003159,0.006622,0.108383,0.149677,0.50362,0.018164,,0.003259,0.006402,1.007938,1.002968,0.000769,,0.007096,0.008619,0.62274,0.12972,0.00829,0.001826,0.2936,,0.006495,0.425143,0.002992,0.815612,0.002106,0.289433,0.035034,CO,O,0.001823,0.001336,,0.479464,0.006855,,0.008589,5.0,0.189972,0.006763,0.009873,0.001513,0.00273,0.000633,0.257676,0.013854,0.006144,0.50629,1.15084,,0.006235,0.362119,0.340423,,0.006347,0.008762,,0.006534,0.000402,0.009662,0.008106,0.003236,0.173019,0.007788,0.604919,0.007499,0.004737,0.007295,0.004215,,6.6e-05,1.006996,0.317591,0.008555,0.006911,0.003373,0.009086,0.001949,0.006862,,0.0,0.007397,0.000639,,0.008764,0.009613,,1,0.00588,0.008371,0.007068,0.004779,0.009163,0.006826,1.00943,0.001587,0.006292,0.002701,0.007758,0.009594,0.00971,0.007728,0.007752,0.00543,0.005929,0.301553,0.137537,0.078743,0.970239,0.007011,0.004957,0.009385,0.003309,,,0.001168,0.007991,0.007811,0.080626,0.020086,1.0,,0.008882,,,,1.004104,1.147685,,0.003374,1.0,0.046637,0.0,4.0,0.041137,0.045305,0.0,0.760963,0.286112,0.001132,0.460282,0.009217,1.0,0.004836,1.006152,1.007872,0.002559,,1.002708,0.007124,,0.004873,0.006783,,,,,,0.005949,0.008454,0.004268,,0.004756,0.009932,0.004188
3,0002d381bdd8048d76719042cf1eb63caf53b636f8aacd...,2017-11-21,1.004737,0.007491,0.011268,0.815235,0.004707,0.167983,0.007022,0.005782,,,0.004346,0.000623,0.204098,0.006022,0.008137,,0.419005,0.016205,,0.16476,0.030058,0.001657,0.071683,0.340903,0.009491,0.008493,0.123615,0.669208,0.293235,,0.002898,0.009279,1.007817,1.006192,0.003584,0.116267,0.012557,0.330734,0.023306,0.142516,0.007849,0.003358,0.002332,,0.002678,0.336137,0.046614,0.018364,0.009088,0.444328,0.284889,CO,O,0.002262,0.090021,,1.009554,0.004585,,0.00054,6.0,0.188458,0.00402,0.008547,0.003696,0.006651,0.007346,0.001627,0.00959,0.000481,0.2044,0.008539,,0.001955,0.00115,0.00968,,0.007706,0.004342,0.263201,0.008383,0.000833,0.00147,0.005853,0.000318,,0.004364,0.004569,0.000893,0.000827,0.007878,0.009827,,0.002171,1.006954,0.006199,0.007014,0.004212,0.008848,0.009341,0.002519,0.000597,,0.0,0.003691,0.007256,,0.00392,0.002669,,1,0.002969,0.009031,0.008911,0.009604,0.001946,0.005293,1.009686,0.006844,0.00904,0.005778,0.005616,0.005936,0.00559,0.009827,0.009982,0.007121,0.0026,0.783929,0.134354,0.804484,0.969548,0.009972,0.545109,1.00123,0.976875,0.423101,,0.339787,0.004525,0.005884,,1.004259,1.0,,0.00166,,,,1.002119,0.064525,,0.005334,1.0,0.354861,0.0,4.0,0.348603,0.345898,0.0,0.718976,0.723254,0.001202,0.554524,0.000523,1.0,0.003754,1.00552,1.009376,0.002034,,0.007847,0.007497,,0.004805,0.003351,,,,,,0.004527,0.009969,0.007597,,0.000389,0.002958,0.006377
4,000473eb907b57c8c23f652bba40f87fe7261273dda470...,2017-08-08,0.600104,0.031315,0.69896,0.026414,0.003807,0.15198,0.005612,0.982836,,0.205391,0.252669,0.200987,0.048019,0.019929,0.002747,0.459173,0.442493,0.759969,,0.000829,0.787099,1.002522,,0.001657,0.636616,0.201164,0.051407,0.827741,0.006361,,0.019792,0.64646,0.008648,1.002048,0.002824,0.113118,0.007629,0.175061,0.689852,,0.014891,0.004405,0.597304,,0.160664,0.357412,0.332458,0.899766,0.007412,0.44081,0.022751,CO,U,0.006996,1.009787,0.994517,0.148648,1.004696,1.0,1.0082,3.0,0.185788,0.004269,0.282491,0.005111,0.001428,0.500662,0.505127,0.006231,0.006744,0.505679,0.74539,,0.006123,0.428581,0.471355,,0.005054,0.00415,,0.283092,0.004833,0.00251,0.006491,0.005071,,0.003844,0.003226,0.008886,0.008856,0.004379,0.006307,,0.002352,1.006771,0.214892,0.004464,0.00552,0.001858,0.003523,0.004248,0.000714,0.00195,1.0,0.005569,0.008498,,0.002063,0.007088,,1,0.00318,0.006332,0.001579,0.008772,0.008577,0.000815,0.008622,0.007164,0.002716,0.0039,0.00158,0.001656,0.003019,1.9e-05,0.003675,0.000234,0.000346,0.953654,1.141669,0.954401,0.19045,0.006645,0.453317,1.001948,0.986592,0.523158,,0.336799,0.000254,0.703397,,1.008359,6.0,,0.00786,,,,1.009013,0.588145,0.00018,0.204834,0.0,0.143651,0.0,-1.0,0.062348,0.064641,0.0,0.291003,0.292169,0.008677,0.280909,0.004211,1.0,0.005739,0.000269,0.008488,0.009117,,0.005012,0.004104,,0.009369,0.003991,,,,,,1.005589,0.00654,0.873327,0.043301,1.00659,0.009968,0.100346


(54849, 190)

In [15]:
df_labels = pd.read_csv('../input/amex-default-prediction/train_labels.csv')
display(len(df_labels))
df = pd.merge(df, df_labels, on='customer_ID', how='left')
display(df.shape, df.head())
df.drop(columns=['customer_ID', 'S_2'], inplace=True)

458913

(54849, 191)

Unnamed: 0,customer_ID,S_2,P_2,D_39,B_1,B_2,R_1,S_3,D_41,B_3,D_42,D_43,D_44,B_4,D_45,B_5,R_2,D_46,D_47,D_48,D_49,B_6,B_7,B_8,D_50,D_51,B_9,R_3,D_52,P_3,B_10,D_53,S_5,B_11,S_6,D_54,R_4,S_7,B_12,S_8,D_55,D_56,B_13,R_5,D_58,S_9,B_14,D_59,D_60,D_61,B_15,S_11,D_62,D_63,D_64,D_65,B_16,B_17,B_18,B_19,D_66,B_20,D_68,S_12,R_6,S_13,B_21,D_69,B_22,D_70,D_71,D_72,S_15,B_23,D_73,P_4,D_74,D_75,D_76,B_24,R_7,D_77,B_25,B_26,D_78,D_79,R_8,R_9,S_16,D_80,R_10,R_11,B_27,D_81,D_82,S_17,R_12,B_28,R_13,D_83,R_14,R_15,D_84,R_16,B_29,B_30,S_18,D_86,D_87,R_17,R_18,D_88,B_31,S_19,R_19,B_32,S_20,R_20,R_21,B_33,D_89,R_22,R_23,D_91,D_92,D_93,D_94,R_24,R_25,D_96,S_22,S_23,S_24,S_25,S_26,D_102,D_103,D_104,D_105,D_106,D_107,B_36,B_37,R_26,R_27,B_38,D_108,D_109,D_110,D_111,B_39,D_112,B_40,S_27,D_113,D_114,D_115,D_116,D_117,D_118,D_119,D_120,D_121,D_122,D_123,D_124,D_125,D_126,D_127,D_128,D_129,B_41,B_42,D_130,D_131,D_132,D_133,R_28,D_134,D_135,D_136,D_137,D_138,D_139,D_140,D_141,D_142,D_143,D_144,D_145,target
0,00007889e4fcd2614b6cbe7f8f3d2e5c728eca32d9eb8a...,2017-05-30,0.937438,0.003936,0.003352,0.814304,0.002594,,0.000545,0.003142,,0.051788,0.006299,0.056203,0.294726,0.006012,0.004684,0.516281,0.463092,0.263574,,0.048533,0.042849,0.006816,0.076369,0.004431,0.006775,0.005824,0.192826,0.577143,0.044728,,0.00328,0.007864,1.000973,1.008453,0.000671,,0.13955,0.003945,0.177336,0.169218,0.008859,0.009552,0.331046,,0.005952,0.292875,0.002996,0.174586,0.005551,0.288992,0.081457,CO,O,0.009775,0.008577,,0.645052,0.001056,1.0,0.001964,6.0,0.19399,0.000306,0.001171,0.000274,0.006808,0.005755,0.259391,0.015849,0.00157,0.507748,0.025547,,0.00137,0.150138,0.137811,,0.009684,0.004231,,0.005078,0.00355,0.003895,0.000551,0.008775,,0.003207,0.004396,0.004527,0.008564,0.006419,0.007386,,0.000966,1.009854,0.058119,0.003344,0.007251,0.003969,0.008143,0.005722,0.009559,,0.0,0.004709,0.001287,,0.006435,0.008478,,1,0.00574,0.005207,0.005587,0.000917,0.002636,0.001219,1.008942,0.000116,0.0026,0.001327,0.005468,0.006224,2.9e-05,0.003084,5.4e-05,0.005109,0.002801,0.301518,0.140075,0.082386,0.977059,0.001901,0.413177,1.000086,0.958687,0.661508,,1.003331,0.006081,0.009483,,1.005789,2.0,,0.000268,,,,1.00941,0.165166,,0.001087,1.0,0.425165,0.0,4.0,0.418822,0.419459,0.0,0.547555,0.43812,0.003239,0.187171,0.006194,1.0,0.009093,1.005538,1.000016,0.00345,,1.005729,0.001219,,0.002394,0.004556,,,,,,0.008103,0.006132,0.003267,,0.008814,0.001211,0.001624,0
1,00013181a0c5fc8f1ea38cd2b90fe8ad2fa8cad9d9f13e...,2017-06-16,0.471242,0.001547,0.000233,0.816437,0.003437,,0.005852,0.005877,,0.20844,0.382538,0.313276,0.054264,0.002035,0.002264,0.95178,0.32547,0.740619,,0.00779,1.047744,1.005298,0.076422,0.000206,0.005905,0.009041,0.148017,0.493458,0.039864,,0.006881,0.003204,1.005171,1.001148,0.007656,,0.010886,0.00733,0.448186,0.126476,0.00481,0.003998,0.619877,,0.003674,0.375356,0.001085,0.599261,0.004723,0.285493,0.017829,CO,O,0.001949,0.005428,,0.532836,0.001264,,0.002074,5.0,0.184956,0.004232,0.004607,0.000368,0.00262,0.003322,1.008265,0.034615,0.001517,0.500245,0.995367,,0.001463,0.365847,0.338917,,0.007694,0.004741,,0.004701,0.009776,0.004254,0.001066,0.003238,0.171472,0.003671,0.409818,0.00182,0.001104,0.008046,0.001091,,0.001583,1.004534,0.211125,0.002501,0.007636,0.009532,0.004129,0.001284,0.008175,,0.0,0.003752,0.004509,,0.001339,0.008494,,1,0.005238,0.008327,0.002736,0.007807,0.005138,0.009324,1.006314,0.005657,0.006198,0.006972,0.007,0.001464,0.001995,0.004677,0.009206,0.008958,0.008415,0.302657,0.134319,0.086714,0.972725,8e-05,0.00358,0.008142,0.003662,,,0.008267,0.00091,0.002263,0.08133,0.015774,1.0,,0.005904,,,,1.002032,1.113697,,0.004361,1.0,0.044666,0.0,4.0,0.045775,0.044385,0.0,0.755079,0.291698,0.00917,0.455286,0.006972,1.0,0.008171,1.004957,1.009854,0.000533,,1.002896,0.006603,,0.007623,0.006064,,,,,,0.009825,0.005849,0.00335,,0.006323,0.007118,0.00742,1
2,00013181a0c5fc8f1ea38cd2b90fe8ad2fa8cad9d9f13e...,2017-08-17,0.467701,0.006205,0.005869,0.819957,0.000378,,0.008037,0.008558,,0.148911,0.627657,0.48063,0.063674,2.8e-05,0.005044,0.935256,0.327515,0.901241,,0.0101,1.164417,1.001536,0.081456,0.003159,0.006622,0.108383,0.149677,0.50362,0.018164,,0.003259,0.006402,1.007938,1.002968,0.000769,,0.007096,0.008619,0.62274,0.12972,0.00829,0.001826,0.2936,,0.006495,0.425143,0.002992,0.815612,0.002106,0.289433,0.035034,CO,O,0.001823,0.001336,,0.479464,0.006855,,0.008589,5.0,0.189972,0.006763,0.009873,0.001513,0.00273,0.000633,0.257676,0.013854,0.006144,0.50629,1.15084,,0.006235,0.362119,0.340423,,0.006347,0.008762,,0.006534,0.000402,0.009662,0.008106,0.003236,0.173019,0.007788,0.604919,0.007499,0.004737,0.007295,0.004215,,6.6e-05,1.006996,0.317591,0.008555,0.006911,0.003373,0.009086,0.001949,0.006862,,0.0,0.007397,0.000639,,0.008764,0.009613,,1,0.00588,0.008371,0.007068,0.004779,0.009163,0.006826,1.00943,0.001587,0.006292,0.002701,0.007758,0.009594,0.00971,0.007728,0.007752,0.00543,0.005929,0.301553,0.137537,0.078743,0.970239,0.007011,0.004957,0.009385,0.003309,,,0.001168,0.007991,0.007811,0.080626,0.020086,1.0,,0.008882,,,,1.004104,1.147685,,0.003374,1.0,0.046637,0.0,4.0,0.041137,0.045305,0.0,0.760963,0.286112,0.001132,0.460282,0.009217,1.0,0.004836,1.006152,1.007872,0.002559,,1.002708,0.007124,,0.004873,0.006783,,,,,,0.005949,0.008454,0.004268,,0.004756,0.009932,0.004188,1
3,0002d381bdd8048d76719042cf1eb63caf53b636f8aacd...,2017-11-21,1.004737,0.007491,0.011268,0.815235,0.004707,0.167983,0.007022,0.005782,,,0.004346,0.000623,0.204098,0.006022,0.008137,,0.419005,0.016205,,0.16476,0.030058,0.001657,0.071683,0.340903,0.009491,0.008493,0.123615,0.669208,0.293235,,0.002898,0.009279,1.007817,1.006192,0.003584,0.116267,0.012557,0.330734,0.023306,0.142516,0.007849,0.003358,0.002332,,0.002678,0.336137,0.046614,0.018364,0.009088,0.444328,0.284889,CO,O,0.002262,0.090021,,1.009554,0.004585,,0.00054,6.0,0.188458,0.00402,0.008547,0.003696,0.006651,0.007346,0.001627,0.00959,0.000481,0.2044,0.008539,,0.001955,0.00115,0.00968,,0.007706,0.004342,0.263201,0.008383,0.000833,0.00147,0.005853,0.000318,,0.004364,0.004569,0.000893,0.000827,0.007878,0.009827,,0.002171,1.006954,0.006199,0.007014,0.004212,0.008848,0.009341,0.002519,0.000597,,0.0,0.003691,0.007256,,0.00392,0.002669,,1,0.002969,0.009031,0.008911,0.009604,0.001946,0.005293,1.009686,0.006844,0.00904,0.005778,0.005616,0.005936,0.00559,0.009827,0.009982,0.007121,0.0026,0.783929,0.134354,0.804484,0.969548,0.009972,0.545109,1.00123,0.976875,0.423101,,0.339787,0.004525,0.005884,,1.004259,1.0,,0.00166,,,,1.002119,0.064525,,0.005334,1.0,0.354861,0.0,4.0,0.348603,0.345898,0.0,0.718976,0.723254,0.001202,0.554524,0.000523,1.0,0.003754,1.00552,1.009376,0.002034,,0.007847,0.007497,,0.004805,0.003351,,,,,,0.004527,0.009969,0.007597,,0.000389,0.002958,0.006377,0
4,000473eb907b57c8c23f652bba40f87fe7261273dda470...,2017-08-08,0.600104,0.031315,0.69896,0.026414,0.003807,0.15198,0.005612,0.982836,,0.205391,0.252669,0.200987,0.048019,0.019929,0.002747,0.459173,0.442493,0.759969,,0.000829,0.787099,1.002522,,0.001657,0.636616,0.201164,0.051407,0.827741,0.006361,,0.019792,0.64646,0.008648,1.002048,0.002824,0.113118,0.007629,0.175061,0.689852,,0.014891,0.004405,0.597304,,0.160664,0.357412,0.332458,0.899766,0.007412,0.44081,0.022751,CO,U,0.006996,1.009787,0.994517,0.148648,1.004696,1.0,1.0082,3.0,0.185788,0.004269,0.282491,0.005111,0.001428,0.500662,0.505127,0.006231,0.006744,0.505679,0.74539,,0.006123,0.428581,0.471355,,0.005054,0.00415,,0.283092,0.004833,0.00251,0.006491,0.005071,,0.003844,0.003226,0.008886,0.008856,0.004379,0.006307,,0.002352,1.006771,0.214892,0.004464,0.00552,0.001858,0.003523,0.004248,0.000714,0.00195,1.0,0.005569,0.008498,,0.002063,0.007088,,1,0.00318,0.006332,0.001579,0.008772,0.008577,0.000815,0.008622,0.007164,0.002716,0.0039,0.00158,0.001656,0.003019,1.9e-05,0.003675,0.000234,0.000346,0.953654,1.141669,0.954401,0.19045,0.006645,0.453317,1.001948,0.986592,0.523158,,0.336799,0.000254,0.703397,,1.008359,6.0,,0.00786,,,,1.009013,0.588145,0.00018,0.204834,0.0,0.143651,0.0,-1.0,0.062348,0.064641,0.0,0.291003,0.292169,0.008677,0.280909,0.004211,1.0,0.005739,0.000269,0.008488,0.009117,,0.005012,0.004104,,0.009369,0.003991,,,,,,1.005589,0.00654,0.873327,0.043301,1.00659,0.009968,0.100346,1


In [16]:
miss_c = df.count()
miss_feat = miss_c[miss_c<40000]
df = df[list(set(df.columns)-set(miss_feat.index))]
display(df.shape)
df[['D_114','D_120','D_68','B_30','D_117','D_116','B_38']] = \
df[['D_114','D_120','D_68','B_30','D_117','D_116','B_38']].apply(pd.to_numeric)
df['D_64'] = df['D_64'].astype('string')
display(df.dtypes)

(54849, 157)

B_9       float64
S_18      float64
D_103     float64
D_131     float64
S_20      float64
D_121     float64
R_18      float64
S_3       float64
D_81      float64
R_2       float64
B_19      float64
P_3       float64
D_79      float64
D_63       object
D_55      float64
D_86      float64
D_52      float64
B_8       float64
R_8       float64
B_4       float64
R_3       float64
B_38      float64
B_6       float64
B_28      float64
D_48      float64
R_24      float64
B_23      float64
D_130     float64
D_62      float64
D_68      float64
D_70      float64
D_128     float64
R_25      float64
S_23      float64
D_83      float64
B_13      float64
B_12      float64
D_122     float64
B_11      float64
S_13      float64
B_15      float64
R_19      float64
D_89      float64
D_107     float64
D_109     float64
B_10      float64
S_11      float64
D_41      float64
D_59      float64
R_7       float64
R_21      float64
B_41      float64
D_104     float64
S_24      float64
D_124     float64
D_51      

In [17]:
### sample split ###

test_size = 0.1
df.reset_index(inplace=True, drop=True)
test_index = random.sample(list(df.index), int(test_size*df.shape[0]))
train = df.iloc[list(set(df.index)-set(test_index))]
test = df.iloc[test_index]
display(train.shape, test.shape, train.head(3), test.head(3))
display(train.dtypes, test.dtypes)


(49365, 157)

(5484, 157)

Unnamed: 0,B_9,S_18,D_103,D_131,S_20,D_121,R_18,S_3,D_81,R_2,B_19,P_3,D_79,D_63,D_55,D_86,D_52,B_8,R_8,B_4,R_3,B_38,B_6,B_28,D_48,R_24,B_23,D_130,D_62,D_68,D_70,D_128,R_25,S_23,D_83,B_13,B_12,D_122,B_11,S_13,B_15,R_19,D_89,D_107,D_109,B_10,S_11,D_41,D_59,R_7,R_21,B_41,D_104,S_24,D_124,D_51,D_75,R_14,D_54,D_143,D_112,R_10,B_24,D_119,R_22,B_20,P_2,B_26,S_19,S_27,R_5,S_22,B_37,R_27,B_36,D_114,S_5,S_25,S_16,D_96,S_12,R_1,B_25,D_118,R_15,B_5,D_116,D_145,B_16,B_22,B_27,D_65,D_115,D_94,D_60,D_125,D_64,S_15,D_144,B_30,R_12,D_141,D_61,R_16,S_17,D_93,D_39,D_133,B_40,S_6,R_13,D_91,D_117,D_80,D_127,B_18,D_123,R_4,B_21,B_33,D_120,R_20,D_129,D_47,B_7,R_17,D_84,D_140,D_139,B_14,target,D_113,D_45,R_6,B_3,B_32,B_2,B_31,D_78,D_69,S_7,D_74,R_11,D_126,D_92,R_23,D_72,P_4,B_1,S_8,D_46,D_102,D_44,D_71,S_26,R_28,D_58
0,0.006775,0.004709,1.000086,0.001219,0.000917,0.547555,0.008478,,0.007386,0.004684,0.001056,0.577143,0.000551,CO,0.177336,0.001287,0.192826,0.006816,0.008775,0.056203,0.005824,2.0,0.048533,0.058119,0.263574,5.4e-05,0.025547,1.005729,0.081457,6.0,0.259391,1.005538,0.005109,0.140075,0.007251,0.008859,0.13955,0.43812,0.007864,0.001171,0.005551,0.005207,0.000116,1.003331,0.000268,0.044728,0.288992,0.000545,0.292875,0.004231,0.001219,0.00345,0.958687,0.082386,0.187171,0.004431,0.137811,0.003969,1.008453,0.008814,1.00941,0.004527,0.009684,0.419459,0.0026,0.001964,0.937438,0.00355,0.00574,,0.009552,0.301518,0.009483,1.005789,0.006081,1.0,0.00328,0.977059,0.003207,0.002801,0.19399,0.002594,0.005078,0.418822,0.008143,0.006012,0.0,0.001624,0.008577,0.005755,0.006419,0.009775,0.425165,0.003084,0.002996,0.006194,O,0.507748,0.001211,0.0,1.009854,0.003267,0.174586,0.009559,0.000966,2.9e-05,0.003936,0.002394,0.165166,1.000973,0.003344,0.005468,4.0,0.004396,0.009093,0.645052,0.003239,0.000671,0.000274,1.008942,0.0,0.002636,1.000016,0.463092,0.042849,0.006435,0.005722,0.006132,0.008103,0.005952,0,0.001087,0.294726,0.000306,0.003142,0.005587,0.814304,1,0.003895,0.006808,,0.150138,0.008564,1.0,0.006224,0.001327,0.00157,0.00137,0.003352,0.003945,0.516281,0.413177,0.006299,0.015849,0.001901,0.004556,0.331046
1,0.005905,0.003752,0.008142,0.006603,0.007807,0.755079,0.008494,,0.001091,0.002264,0.001264,0.493458,0.001066,CO,0.448186,0.004509,0.148017,1.005298,0.003238,0.313276,0.009041,1.0,0.00779,0.211125,0.740619,0.009206,0.995367,1.002896,0.017829,5.0,1.008265,1.004957,0.008958,0.134319,0.007636,0.00481,0.010886,0.291698,0.003204,0.004607,0.004723,0.008327,0.005657,0.008267,0.005904,0.039864,0.285493,0.005852,0.375356,0.004741,0.009324,0.000533,0.003662,0.086714,0.455286,0.000206,0.338917,0.009532,1.001148,0.006323,1.002032,0.00182,0.007694,0.044385,0.006198,0.002074,0.471242,0.009776,0.005238,,0.003998,0.302657,0.002263,0.015774,0.00091,1.0,0.006881,0.972725,0.003671,0.008415,0.184956,0.003437,0.004701,0.045775,0.004129,0.002035,0.0,0.00742,0.005428,0.003322,0.008046,0.001949,0.044666,0.004677,0.001085,0.006972,O,0.500245,0.007118,0.0,1.004534,0.00335,0.599261,0.008175,0.001583,0.001995,0.001547,0.007623,1.113697,1.005171,0.002501,0.007,4.0,0.409818,0.008171,0.532836,0.00917,0.007656,0.000368,1.006314,0.0,0.005138,1.009854,0.32547,1.047744,0.001339,0.001284,0.005849,0.009825,0.003674,1,0.004361,0.054264,0.004232,0.005877,0.002736,0.816437,1,0.004254,0.00262,,0.365847,0.001104,1.0,0.001464,0.006972,0.001517,0.001463,0.000233,0.00733,0.95178,0.00358,0.382538,0.034615,8e-05,0.006064,0.619877
2,0.006622,0.007397,0.009385,0.007124,0.004779,0.760963,0.009613,,0.004215,0.005044,0.006855,0.50362,0.008106,CO,0.62274,0.000639,0.149677,1.001536,0.003236,0.48063,0.108383,1.0,0.0101,0.317591,0.901241,0.007752,1.15084,1.002708,0.035034,5.0,0.257676,1.006152,0.00543,0.137537,0.006911,0.00829,0.007096,0.286112,0.006402,0.009873,0.002106,0.008371,0.001587,0.001168,0.008882,0.018164,0.289433,0.008037,0.425143,0.008762,0.006826,0.002559,0.003309,0.078743,0.460282,0.003159,0.340423,0.003373,1.002968,0.004756,1.004104,0.007499,0.006347,0.045305,0.006292,0.008589,0.467701,0.000402,0.00588,,0.001826,0.301553,0.007811,0.020086,0.007991,1.0,0.003259,0.970239,0.007788,0.005929,0.189972,0.000378,0.006534,0.041137,0.009086,2.8e-05,0.0,0.004188,0.001336,0.000633,0.007295,0.001823,0.046637,0.007728,0.002992,0.009217,O,0.50629,0.009932,0.0,1.006996,0.004268,0.815612,0.006862,6.6e-05,0.00971,0.006205,0.004873,1.147685,1.007938,0.008555,0.007758,4.0,0.604919,0.004836,0.479464,0.001132,0.000769,0.001513,1.00943,0.0,0.009163,1.007872,0.327515,1.164417,0.008764,0.001949,0.008454,0.005949,0.006495,1,0.003374,0.063674,0.006763,0.008558,0.007068,0.819957,1,0.009662,0.00273,,0.362119,0.004737,1.0,0.009594,0.002701,0.006144,0.006235,0.005869,0.008619,0.935256,0.004957,0.627657,0.013854,0.007011,0.006783,0.2936


Unnamed: 0,B_9,S_18,D_103,D_131,S_20,D_121,R_18,S_3,D_81,R_2,B_19,P_3,D_79,D_63,D_55,D_86,D_52,B_8,R_8,B_4,R_3,B_38,B_6,B_28,D_48,R_24,B_23,D_130,D_62,D_68,D_70,D_128,R_25,S_23,D_83,B_13,B_12,D_122,B_11,S_13,B_15,R_19,D_89,D_107,D_109,B_10,S_11,D_41,D_59,R_7,R_21,B_41,D_104,S_24,D_124,D_51,D_75,R_14,D_54,D_143,D_112,R_10,B_24,D_119,R_22,B_20,P_2,B_26,S_19,S_27,R_5,S_22,B_37,R_27,B_36,D_114,S_5,S_25,S_16,D_96,S_12,R_1,B_25,D_118,R_15,B_5,D_116,D_145,B_16,B_22,B_27,D_65,D_115,D_94,D_60,D_125,D_64,S_15,D_144,B_30,R_12,D_141,D_61,R_16,S_17,D_93,D_39,D_133,B_40,S_6,R_13,D_91,D_117,D_80,D_127,B_18,D_123,R_4,B_21,B_33,D_120,R_20,D_129,D_47,B_7,R_17,D_84,D_140,D_139,B_14,target,D_113,D_45,R_6,B_3,B_32,B_2,B_31,D_78,D_69,S_7,D_74,R_11,D_126,D_92,R_23,D_72,P_4,B_1,S_8,D_46,D_102,D_44,D_71,S_26,R_28,D_58
216,1.444255,0.009417,,,0.003057,0.473626,0.006544,0.359909,0.008868,0.007943,0.259086,0.030894,0.003067,CO,0.337822,0.001228,0.016975,1.000701,0.00309,0.163279,0.209717,5.0,0.066098,0.166299,0.771333,0.005702,0.135183,,0.038942,6.0,0.007386,,0.007412,0.136268,0.001566,0.168719,0.100893,0.576253,0.169822,0.423284,0.002088,0.003227,0.002571,,0.003375,0.075979,0.368,0.004137,0.294821,0.009654,0.005276,0.009147,,0.986665,0.507456,0.004704,0.07353,0.009287,1.004854,,1.000382,0.001687,0.000435,0.0006,0.005871,0.18368,0.299391,0.001548,0.008753,0.415058,0.007849,0.982266,0.166875,1.007699,0.001042,1.0,0.582539,0.977539,0.00927,0.003315,0.494861,0.751433,0.581687,0.003201,1.003143,0.156938,0.0,,0.425529,0.000392,0.003031,0.004079,0.315349,0.001202,0.929644,8.9e-05,U,0.504322,0.002925,0.0,1.001113,,0.469507,0.006816,0.11122,0.005591,0.505173,0.008632,0.065122,0.004058,0.006003,0.004262,2.0,0.005874,0.004301,0.310086,0.007362,0.00092,0.001072,0.000151,0.0,0.005379,,0.07271,0.151678,0.009122,0.009029,0.003104,,0.326419,0,0.204278,0.039325,0.005607,0.20167,0.002532,0.186654,1,0.00035,0.002008,0.554447,0.004734,0.001756,1.0,0.004455,0.000584,0.009038,0.002059,0.166677,0.173094,0.502261,0.006682,0.251616,0.006429,0.005369,0.000797,0.007204
34664,1.169251,0.002008,1.006972,0.009043,0.007865,0.140892,0.001222,0.177112,0.001259,0.006259,0.334101,0.663608,0.006955,CO,0.464773,0.000932,0.031289,1.002856,0.00893,0.096213,0.100236,3.0,0.049421,0.031908,0.861864,0.004269,0.070308,0.004465,0.007085,5.0,0.004793,0.005361,0.005957,0.063415,0.000988,0.024044,0.008699,0.147531,0.155693,0.425198,0.004192,0.000757,0.002556,0.342849,0.004481,0.049941,0.202189,0.005216,0.459632,0.008469,0.006466,0.000655,0.996531,0.958253,0.000425,0.00456,0.140121,0.009,1.008672,0.003039,1.00516,0.00466,0.003516,0.184069,0.005463,0.118687,0.528807,0.004165,0.007795,0.574544,0.008119,0.947514,0.200391,1.007818,0.002676,0.0,0.188754,0.483392,0.00959,0.007818,0.644396,0.004689,0.085923,0.187515,0.004209,0.009423,0.0,0.005125,0.335212,0.002638,0.002801,0.00437,0.174579,0.002214,0.751463,0.004994,U,0.405355,0.009937,0.0,1.008628,0.009443,0.875893,0.000573,0.005159,0.002495,0.00336,0.009262,0.079874,0.004908,0.005713,0.001654,-1.0,0.003889,0.007115,0.246207,0.002082,0.007789,0.006686,0.008691,0.0,0.004772,0.009868,0.100071,0.100784,0.005997,0.005627,0.008163,0.000833,0.049547,0,0.007552,0.04625,0.004941,0.27912,0.007916,0.07689,1,0.001922,0.007357,0.216772,0.073471,0.005696,1.0,0.009677,0.009946,0.00707,0.003067,0.193914,0.316177,,0.945436,0.128189,0.011239,0.003324,0.002776,0.079812
36474,0.010184,0.007841,0.009112,0.003039,0.004583,0.713884,0.006655,0.092026,0.002203,0.005836,0.005627,0.615564,0.009471,CO,0.046157,0.006712,0.156647,1.007184,0.003518,0.013102,0.10283,2.0,0.223363,0.02787,0.061561,0.001024,0.008439,0.003755,0.352602,6.0,0.000832,0.006865,3.9e-05,0.135299,0.003906,0.110068,0.06803,0.435594,0.005237,0.005705,0.001212,0.001418,0.005857,0.004234,0.006361,0.298999,0.483574,0.00931,0.341502,5.1e-05,0.001566,0.005015,0.009794,0.899906,0.273166,0.002033,0.004485,0.001189,1.00751,0.006523,1.003022,0.007149,0.007656,0.402574,0.00714,0.002993,0.850812,0.005209,0.009061,0.009217,0.009452,0.905833,0.007076,1.002973,0.004911,0.0,0.004617,0.969777,0.008442,0.008341,0.193427,0.005888,0.016419,0.41177,0.005926,0.029651,0.0,0.007546,0.088448,0.009505,0.009709,0.007429,0.414522,0.002642,0.116406,0.007444,R,0.201933,0.003268,0.0,1.008811,0.005988,0.078874,0.009297,0.009755,0.005705,0.035452,0.008252,0.022123,0.003669,0.008212,0.005762,4.0,0.001646,0.002971,1.007002,0.00857,0.005526,0.00794,1.008204,0.0,0.00478,0.003365,0.422543,0.019701,0.006619,0.005362,0.008492,0.007487,0.013432,0,0.003865,0.018572,0.005037,0.005757,0.000677,1.000451,1,0.007719,0.009491,0.112452,0.005922,0.008618,1.0,0.003971,0.004547,0.004804,0.970602,0.004797,0.473436,0.473129,0.005355,0.004659,0.182165,0.061001,0.00929,0.001203


B_9       float64
S_18      float64
D_103     float64
D_131     float64
S_20      float64
D_121     float64
R_18      float64
S_3       float64
D_81      float64
R_2       float64
B_19      float64
P_3       float64
D_79      float64
D_63       object
D_55      float64
D_86      float64
D_52      float64
B_8       float64
R_8       float64
B_4       float64
R_3       float64
B_38      float64
B_6       float64
B_28      float64
D_48      float64
R_24      float64
B_23      float64
D_130     float64
D_62      float64
D_68      float64
D_70      float64
D_128     float64
R_25      float64
S_23      float64
D_83      float64
B_13      float64
B_12      float64
D_122     float64
B_11      float64
S_13      float64
B_15      float64
R_19      float64
D_89      float64
D_107     float64
D_109     float64
B_10      float64
S_11      float64
D_41      float64
D_59      float64
R_7       float64
R_21      float64
B_41      float64
D_104     float64
S_24      float64
D_124     float64
D_51      

B_9       float64
S_18      float64
D_103     float64
D_131     float64
S_20      float64
D_121     float64
R_18      float64
S_3       float64
D_81      float64
R_2       float64
B_19      float64
P_3       float64
D_79      float64
D_63       object
D_55      float64
D_86      float64
D_52      float64
B_8       float64
R_8       float64
B_4       float64
R_3       float64
B_38      float64
B_6       float64
B_28      float64
D_48      float64
R_24      float64
B_23      float64
D_130     float64
D_62      float64
D_68      float64
D_70      float64
D_128     float64
R_25      float64
S_23      float64
D_83      float64
B_13      float64
B_12      float64
D_122     float64
B_11      float64
S_13      float64
B_15      float64
R_19      float64
D_89      float64
D_107     float64
D_109     float64
B_10      float64
S_11      float64
D_41      float64
D_59      float64
R_7       float64
R_21      float64
B_41      float64
D_104     float64
S_24      float64
D_124     float64
D_51      

In [18]:
cat_feat = ['B_30', 'B_38', 'D_114', 'D_116', 'D_117', 'D_120', 'D_126', 'D_63', 'D_64', 'D_68']
num_feat = list(set(train.columns)-set(cat_feat)-set(['target']))
display(train[cat_feat], [train[col].value_counts() for col in train.columns if col in cat_feat])

Unnamed: 0,B_30,B_38,D_114,D_116,D_117,D_120,D_126,D_63,D_64,D_68
0,0.0,2.0,1.0,0.0,4.0,0.0,1.0,CO,O,6.0
1,0.0,1.0,1.0,0.0,4.0,0.0,1.0,CO,O,5.0
2,0.0,1.0,1.0,0.0,4.0,0.0,1.0,CO,O,5.0
3,0.0,1.0,1.0,0.0,4.0,0.0,1.0,CO,O,6.0
5,0.0,2.0,1.0,0.0,-1.0,0.0,1.0,CL,R,6.0
...,...,...,...,...,...,...,...,...,...,...
54843,1.0,5.0,0.0,0.0,2.0,0.0,1.0,CO,R,5.0
54844,0.0,2.0,1.0,0.0,4.0,1.0,1.0,CR,O,5.0
54845,0.0,3.0,1.0,0.0,-1.0,0.0,1.0,CO,U,5.0
54847,0.0,1.0,,,,,,CO,,


[CO    36903
 CR     8216
 CL     3895
 XZ      205
 XM       83
 XL       63
 Name: D_63, dtype: int64,
 2.0    17287
 3.0    11327
 1.0    10312
 5.0     3970
 4.0     2613
 7.0     2285
 6.0     1551
 Name: B_38, dtype: int64,
 6.0    24806
 5.0    10693
 3.0     4382
 4.0     4237
 2.0     1971
 1.0     1228
 0.0      139
 Name: D_68, dtype: int64,
 1.0    29519
 0.0    18273
 Name: D_114, dtype: int64,
 0.0    47729
 1.0       63
 Name: D_116, dtype: int64,
 O     25873
 U     13683
 R      7560
 -1      346
 Name: D_64, dtype: Int64,
 0.0    42045
 1.0     6837
 2.0      463
 Name: B_30, dtype: int64,
 -1.0    13055
  3.0    10435
  4.0    10192
  2.0     5882
  5.0     4001
  6.0     3103
  1.0     1124
 Name: D_117, dtype: int64,
 0.0    42194
 1.0     5598
 Name: D_120, dtype: int64,
  1.0    37944
  0.0     8120
 -1.0     2272
 Name: D_126, dtype: int64]

In [19]:
for col in cat_feat:
    if train[col].dtype in ['string', 'O', 'str']:
        print('str', col)
    else:
        print('num', col)
        

num B_30
num B_38
num D_114
num D_116
num D_117
num D_120
num D_126
str D_63
str D_64
num D_68


In [20]:
### fill missing values ###

for col in cat_feat:
    if train[col].dtype in ['string', 'O', 'str']:
        train[col].fillna(value='M', inplace=True)
        test[col].fillna(value='M', inplace=True)
    else:
        train[col].fillna(value=-1000, inplace=True)
        test[col].fillna(value=-1000, inplace=True)

for col in num_feat:
    train[col].fillna(train[col].median(), inplace=True)
    test[col].fillna(train[col].median(), inplace=True)
    
display(train.count(), test.count())

B_9       49365
S_18      49365
D_103     49365
D_131     49365
S_20      49365
D_121     49365
R_18      49365
S_3       49365
D_81      49365
R_2       49365
B_19      49365
P_3       49365
D_79      49365
D_63      49365
D_55      49365
D_86      49365
D_52      49365
B_8       49365
R_8       49365
B_4       49365
R_3       49365
B_38      49365
B_6       49365
B_28      49365
D_48      49365
R_24      49365
B_23      49365
D_130     49365
D_62      49365
D_68      49365
D_70      49365
D_128     49365
R_25      49365
S_23      49365
D_83      49365
B_13      49365
B_12      49365
D_122     49365
B_11      49365
S_13      49365
B_15      49365
R_19      49365
D_89      49365
D_107     49365
D_109     49365
B_10      49365
S_11      49365
D_41      49365
D_59      49365
R_7       49365
R_21      49365
B_41      49365
D_104     49365
S_24      49365
D_124     49365
D_51      49365
D_75      49365
R_14      49365
D_54      49365
D_143     49365
D_112     49365
R_10      49365
B_24    

B_9       5484
S_18      5484
D_103     5484
D_131     5484
S_20      5484
D_121     5484
R_18      5484
S_3       5484
D_81      5484
R_2       5484
B_19      5484
P_3       5484
D_79      5484
D_63      5484
D_55      5484
D_86      5484
D_52      5484
B_8       5484
R_8       5484
B_4       5484
R_3       5484
B_38      5484
B_6       5484
B_28      5484
D_48      5484
R_24      5484
B_23      5484
D_130     5484
D_62      5484
D_68      5484
D_70      5484
D_128     5484
R_25      5484
S_23      5484
D_83      5484
B_13      5484
B_12      5484
D_122     5484
B_11      5484
S_13      5484
B_15      5484
R_19      5484
D_89      5484
D_107     5484
D_109     5484
B_10      5484
S_11      5484
D_41      5484
D_59      5484
R_7       5484
R_21      5484
B_41      5484
D_104     5484
S_24      5484
D_124     5484
D_51      5484
D_75      5484
R_14      5484
D_54      5484
D_143     5484
D_112     5484
R_10      5484
B_24      5484
D_119     5484
R_22      5484
B_20      5484
P_2       

In [21]:
### OHE ###

X_train = train.copy()
y_train = X_train.pop('target')
X_test = test.copy()
y_test = X_test.pop('target')
display(X_test.head())
#display(X_train.nunique())

### Do OHE for some features ###

# this code uses passthrough from 
# https://stackoverflow.com/questions/54160370/how-to-use-sklearn-column-transformer

feature_transformer = ColumnTransformer([
    ("cat", OneHotEncoder(sparse = False, handle_unknown="ignore"), cat_feat)],
    remainder="passthrough")

print('Number of features before transaformation: ', X_train.shape)
X_train = pd.DataFrame(feature_transformer.fit_transform(X_train), columns=feature_transformer.get_feature_names_out())
X_test = pd.DataFrame(feature_transformer.transform(X_test), columns=feature_transformer.get_feature_names_out())

print('time to do feature proprocessing: ', time.time()-time0)

print('Number of features after transaformation: ', X_train.shape)

Unnamed: 0,B_9,S_18,D_103,D_131,S_20,D_121,R_18,S_3,D_81,R_2,B_19,P_3,D_79,D_63,D_55,D_86,D_52,B_8,R_8,B_4,R_3,B_38,B_6,B_28,D_48,R_24,B_23,D_130,D_62,D_68,D_70,D_128,R_25,S_23,D_83,B_13,B_12,D_122,B_11,S_13,B_15,R_19,D_89,D_107,D_109,B_10,S_11,D_41,D_59,R_7,R_21,B_41,D_104,S_24,D_124,D_51,D_75,R_14,D_54,D_143,D_112,R_10,B_24,D_119,R_22,B_20,P_2,B_26,S_19,S_27,R_5,S_22,B_37,R_27,B_36,D_114,S_5,S_25,S_16,D_96,S_12,R_1,B_25,D_118,R_15,B_5,D_116,D_145,B_16,B_22,B_27,D_65,D_115,D_94,D_60,D_125,D_64,S_15,D_144,B_30,R_12,D_141,D_61,R_16,S_17,D_93,D_39,D_133,B_40,S_6,R_13,D_91,D_117,D_80,D_127,B_18,D_123,R_4,B_21,B_33,D_120,R_20,D_129,D_47,B_7,R_17,D_84,D_140,D_139,B_14,D_113,D_45,R_6,B_3,B_32,B_2,B_31,D_78,D_69,S_7,D_74,R_11,D_126,D_92,R_23,D_72,P_4,B_1,S_8,D_46,D_102,D_44,D_71,S_26,R_28,D_58
216,1.444255,0.009417,0.0093,0.005543,0.003057,0.473626,0.006544,0.359909,0.008868,0.007943,0.259086,0.030894,0.003067,CO,0.337822,0.001228,0.016975,1.000701,0.00309,0.163279,0.209717,5.0,0.066098,0.166299,0.771333,0.005702,0.135183,0.00623,0.038942,6.0,0.007386,1.000377,0.007412,0.136268,0.001566,0.168719,0.100893,0.576253,0.169822,0.423284,0.002088,0.003227,0.002571,0.009311,0.003375,0.075979,0.368,0.004137,0.294821,0.009654,0.005276,0.009147,0.009302,0.986665,0.507456,0.004704,0.07353,0.009287,1.004854,0.006128,1.000382,0.001687,0.000435,0.0006,0.005871,0.18368,0.299391,0.001548,0.008753,0.415058,0.007849,0.982266,0.166875,1.007699,0.001042,1.0,0.582539,0.977539,0.00927,0.003315,0.494861,0.751433,0.581687,0.003201,1.003143,0.156938,0.0,0.006069,0.425529,0.000392,0.003031,0.004079,0.315349,0.001202,0.929644,8.9e-05,U,0.504322,0.002925,0.0,1.001113,0.006115,0.469507,0.006816,0.11122,0.005591,0.505173,0.008632,0.065122,0.004058,0.006003,0.004262,2.0,0.005874,0.004301,0.310086,0.007362,0.00092,0.001072,0.000151,0.0,0.005379,0.008785,0.07271,0.151678,0.009122,0.009029,0.003104,0.006073,0.326419,0.204278,0.039325,0.005607,0.20167,0.002532,0.186654,1,0.00035,0.002008,0.554447,0.004734,0.001756,1.0,0.004455,0.000584,0.009038,0.002059,0.166677,0.173094,0.502261,0.006682,0.251616,0.006429,0.005369,0.000797,0.007204
34664,1.169251,0.002008,1.006972,0.009043,0.007865,0.140892,0.001222,0.177112,0.001259,0.006259,0.334101,0.663608,0.006955,CO,0.464773,0.000932,0.031289,1.002856,0.00893,0.096213,0.100236,3.0,0.049421,0.031908,0.861864,0.004269,0.070308,0.004465,0.007085,5.0,0.004793,0.005361,0.005957,0.063415,0.000988,0.024044,0.008699,0.147531,0.155693,0.425198,0.004192,0.000757,0.002556,0.342849,0.004481,0.049941,0.202189,0.005216,0.459632,0.008469,0.006466,0.000655,0.996531,0.958253,0.000425,0.00456,0.140121,0.009,1.008672,0.003039,1.00516,0.00466,0.003516,0.184069,0.005463,0.118687,0.528807,0.004165,0.007795,0.574544,0.008119,0.947514,0.200391,1.007818,0.002676,0.0,0.188754,0.483392,0.00959,0.007818,0.644396,0.004689,0.085923,0.187515,0.004209,0.009423,0.0,0.005125,0.335212,0.002638,0.002801,0.00437,0.174579,0.002214,0.751463,0.004994,U,0.405355,0.009937,0.0,1.008628,0.009443,0.875893,0.000573,0.005159,0.002495,0.00336,0.009262,0.079874,0.004908,0.005713,0.001654,-1.0,0.003889,0.007115,0.246207,0.002082,0.007789,0.006686,0.008691,0.0,0.004772,0.009868,0.100071,0.100784,0.005997,0.005627,0.008163,0.000833,0.049547,0.007552,0.04625,0.004941,0.27912,0.007916,0.07689,1,0.001922,0.007357,0.216772,0.073471,0.005696,1.0,0.009677,0.009946,0.00707,0.003067,0.193914,0.316177,0.460129,0.945436,0.128189,0.011239,0.003324,0.002776,0.079812
36474,0.010184,0.007841,0.009112,0.003039,0.004583,0.713884,0.006655,0.092026,0.002203,0.005836,0.005627,0.615564,0.009471,CO,0.046157,0.006712,0.156647,1.007184,0.003518,0.013102,0.10283,2.0,0.223363,0.02787,0.061561,0.001024,0.008439,0.003755,0.352602,6.0,0.000832,0.006865,3.9e-05,0.135299,0.003906,0.110068,0.06803,0.435594,0.005237,0.005705,0.001212,0.001418,0.005857,0.004234,0.006361,0.298999,0.483574,0.00931,0.341502,5.1e-05,0.001566,0.005015,0.009794,0.899906,0.273166,0.002033,0.004485,0.001189,1.00751,0.006523,1.003022,0.007149,0.007656,0.402574,0.00714,0.002993,0.850812,0.005209,0.009061,0.009217,0.009452,0.905833,0.007076,1.002973,0.004911,0.0,0.004617,0.969777,0.008442,0.008341,0.193427,0.005888,0.016419,0.41177,0.005926,0.029651,0.0,0.007546,0.088448,0.009505,0.009709,0.007429,0.414522,0.002642,0.116406,0.007444,R,0.201933,0.003268,0.0,1.008811,0.005988,0.078874,0.009297,0.009755,0.005705,0.035452,0.008252,0.022123,0.003669,0.008212,0.005762,4.0,0.001646,0.002971,1.007002,0.00857,0.005526,0.00794,1.008204,0.0,0.00478,0.003365,0.422543,0.019701,0.006619,0.005362,0.008492,0.007487,0.013432,0.003865,0.018572,0.005037,0.005757,0.000677,1.000451,1,0.007719,0.009491,0.112452,0.005922,0.008618,1.0,0.003971,0.004547,0.004804,0.970602,0.004797,0.473436,0.473129,0.005355,0.004659,0.182165,0.061001,0.00929,0.001203
46599,0.288772,0.009478,1.000557,0.007277,0.0088,0.221624,0.003488,0.12675,0.003372,0.001444,0.005218,0.726268,0.008596,CO,0.391363,0.008639,0.238714,0.005946,0.000918,0.278196,0.100581,5.0,0.031886,0.26755,0.282979,0.003227,0.235845,0.004825,0.024206,3.0,0.253581,0.002417,0.001635,0.138209,0.000305,0.048443,0.019098,0.148457,0.306186,0.696115,0.004036,0.00958,0.005528,0.33925,0.000774,0.075209,0.809264,0.00703,0.528809,0.00268,0.007244,0.00154,0.983375,0.971696,0.275631,0.005918,0.400703,0.000893,1.004924,0.007949,1.001596,0.009647,0.006217,0.031192,0.008089,0.592878,0.681778,0.000723,0.006479,0.457766,0.00746,0.960399,0.344641,1.003984,0.006941,0.0,0.020884,0.97182,0.002923,0.002425,0.185168,0.008282,0.207088,0.032525,0.003256,0.039125,0.0,0.002536,0.837838,0.004357,0.000335,0.003493,0.034184,0.005661,0.19899,0.006545,U,0.205354,0.007682,0.0,1.002828,0.004264,0.509727,0.00823,0.0044,0.008496,0.005965,0.008179,0.148552,0.004337,0.003082,0.006238,2.0,0.407747,0.004666,0.097918,0.006558,0.008113,0.008934,0.003357,0.0,0.001231,0.001639,0.195367,0.245165,0.007838,0.002833,0.000111,0.004343,0.122363,0.807255,0.265456,0.007332,0.121636,0.009562,0.080022,1,0.008125,0.007679,0.092168,0.364091,0.006157,0.0,0.002634,0.008275,0.00322,0.007626,0.342176,0.702514,0.303347,0.693125,0.004686,0.012893,0.117818,0.002208,0.604158
5305,0.003179,0.003633,1.004772,0.007819,0.0004,0.767615,0.00752,0.108771,0.003169,0.001707,0.009968,0.657097,0.001128,CO,0.020905,0.003272,0.133451,0.002677,0.003456,0.009391,0.00995,2.0,0.294421,0.003061,0.005256,0.001477,0.007558,1.009273,0.093258,5.0,0.007096,1.003024,0.001814,0.139183,0.009961,0.066041,0.088832,0.433618,0.005969,0.001052,0.004383,0.00776,0.004721,0.339577,0.003371,0.304112,0.48758,0.287075,0.425267,0.008976,0.007637,0.007759,0.965811,0.930973,0.281481,0.341345,0.005065,0.001546,1.005772,0.005875,0.00138,6.5e-05,0.002314,0.071693,0.000194,0.008831,0.779958,0.007751,0.002247,0.241786,0.001706,0.928614,0.003027,1.009683,0.009019,1.0,0.001355,0.971615,0.000956,0.006342,0.19419,0.007355,0.014456,0.069616,0.005632,0.009468,0.0,0.009149,0.088276,0.007933,0.005072,0.002511,0.073226,0.008723,1.002684,0.007882,O,0.308386,0.005001,0.0,1.00175,0.008799,0.00609,0.009668,0.003043,0.006287,0.118265,0.004562,0.004755,0.004878,0.001225,0.000109,4.0,0.009656,0.007238,1.007288,0.003887,0.00404,0.004493,1.005025,0.0,0.001486,0.008815,0.656147,0.008079,4.1e-05,0.001967,0.004339,0.00049,0.012259,0.209717,0.335829,0.004764,0.015013,0.004181,1.005768,1,0.000194,0.005637,0.072944,0.00468,0.008553,1.0,0.009935,0.009829,0.008695,0.000443,0.009423,0.330017,0.460129,0.435757,0.008524,0.137839,0.087369,0.00688,0.002792


Number of features before transaformation:  (49365, 156)
time to do feature proprocessing:  3.254011631011963
Number of features after transaformation:  (49365, 198)


In [24]:
# Modeling #

xgb = XGBClassifier(tree_method = 'gpu_hist', n_estimators=500, eta=0.03)
xgb.fit(X_train, y_train)

display('Accuracy: ', accuracy_score(y_train,xgb.predict(X_train)))
display('F1 score: ', f1_score(y_train,xgb.predict(X_train)))
display('Recall score: ', recall_score(y_train,xgb.predict(X_train)))
display('Precision score: ', precision_score(y_train,xgb.predict(X_train)))
display('ROC score: ', roc_auc_score(y_train,xgb.predict(X_train)))

# Performance evaluation:
display('Accuracy: ', accuracy_score(y_test,xgb.predict(X_test)))
display('F1 score: ', f1_score(y_test,xgb.predict(X_test)))
display('Recall score: ', recall_score(y_test,xgb.predict(X_test)))
display('Precision score: ', precision_score(y_test,xgb.predict(X_test)))
display('ROC score: ', roc_auc_score(y_test,xgb.predict(X_test)))

'Accuracy: '

0.9318545528208245

'F1 score: '

0.8641905530884134

'Recall score: '

0.8601623402716386

'Precision score: '

0.8682566723452584

'Accuracy: '

0.8736323851203501

'F1 score: '

0.7443747694577647

'Recall score: '

0.7306299782766111

'Precision score: '

0.7586466165413533

In [None]:
### Fit XGBoost using Optuna hyperparameter optimization ###

def objective(trial, n_splits=2, n_jobs=-1, scale_pos_weight=1, early_stopping_rounds=50):

    cv_regularizer=0.0
    # Usually values between 0.1 and 0.2 work fine.

    params = {
        "tree_method": 'gpu_hist',
        "verbosity": 0,  # 0 (silent) - 3 (debug)
        "n_estimators": trial.suggest_int("n_estimators", 100, 500),
        "max_depth": trial.suggest_int("max_depth", 2, 10),
        "learning_rate": trial.suggest_uniform("learning_rate", 0.01, 0.2),
        "colsample_bytree": trial.suggest_uniform("colsample_bytree", 0.1, 0.95),
        "subsample": trial.suggest_uniform("subsample", 0.5, 0.95),
        "alpha": trial.suggest_loguniform("alpha", 0.1, 10.0),
        "lambda": trial.suggest_loguniform("lambda", 0.1, 150.0),
        "gamma": trial.suggest_loguniform("gamma", 1e-10, 10.0),
        "min_child_weight": trial.suggest_loguniform("min_child_weight", 0.1, 10),
        "n_jobs": n_jobs,
    }

    X = X_train
    y = y_train

    model = XGBClassifier(**params)
    rkf = KFold(n_splits=n_splits, shuffle=True)
    X_values = X.values
    y_values = y.values
    y_pred = np.zeros_like(y_values)
    y_pred_train = np.zeros_like(y_values)
    for train_index, test_index in rkf.split(X_values):
        X_A, X_B = X_values[train_index, :], X_values[test_index, :]
        y_A, y_B = y_values[train_index], y_values[test_index]
        model.fit(X_A, y_A, eval_set=[(X_B, y_B)],
                  early_stopping_rounds=early_stopping_rounds, verbose = False)
        y_pred[test_index] += model.predict(X_B)
        y_pred_train[train_index] += model.predict(X_A)
    score_train = roc_auc_score(y_train, y_pred_train)
    score_test = roc_auc_score(y_train, y_pred) 
    overfit = score_train-score_test
    #return (f1_score_test)
    return (score_test-cv_regularizer*overfit)

study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=20)
print('Total time for hypermarameter optimization ', time.time()-time1)
hp = study.best_params
for key, value in hp.items():
    print(f"{key:>20s} : {value}")
print(f"{'best objective value':>20s} : {study.best_value}")

optuna_hyperpars = study.best_params
optuna_hyperpars['tree_method']='gpu_hist'
optuna_hyperpars['scale_pos_weight']=1
optuna_hyperpars['early_stopping_rounds']=50

optuna_xgb = XGBClassifier(**optuna_hyperpars)
optuna_xgb.fit(X_train, y_train)

y_score = optuna_xgb.predict_proba(X_test)[:, 1]
sample_weight = np.ones((len(y_score),))
sample_weight[(y_test == 0).values] = (1 * 1.0)/downsampling_factor

# pr in train set
sample_w_train = np.ones((len(y_train),))
sample_w_train[(y_train == 0).values] = (1 * 1.0)/downsampling_factor
precision_t, recall_t, threshold = precision_recall_curve(y_train, optuna_xgb.predict_proba(X_train)[:, 1], 
                                                      sample_weight=sample_w_train)
auc_precision_recall_train = auc(recall_t, precision_t)
temp = recall_t[(recall_t>0.195)&(recall_t<0.205)]
temp = temp[int(len(temp)/2)]
indexx = ((np.where(recall_t==temp)))[0][0]
r20prec_train = precision_t[indexx]

fig, ax = plt.subplots()
ax.plot(recall_t, precision_t, color='purple')
ax.set_title('Precision-Recall Curve, train')
ax.set_ylabel('Precision')
ax.set_xlabel('Recall')
ax.set_ylim(bottom=0, top=1.02)
plt.show()

# pr in test set
precision, recall, threshold = precision_recall_curve(y_test, y_score, sample_weight=sample_weight)
auc_precision_recall_test = auc(recall, precision)
temp = recall[(recall>0.195)&(recall<0.205)]
temp = temp[int(len(temp)/2)]
indexx = ((np.where(recall==temp)))[0][0]
r20prec_test = precision[indexx]

fig, ax = plt.subplots()
ax.plot(recall, precision, color='purple')
ax.set_title('Precision-Recall Curve, test')
ax.set_ylabel('Precision')
ax.set_xlabel('Recall')
ax.set_ylim(bottom=0, top=1.02)
plt.show()


display('Accuracy: ', accuracy_score(y_train,optuna_xgb.predict(X_train)))
display('F1 score: ', f1_score(y_train,optuna_xgb.predict(X_train)))
display('Recall score: ', recall_score(y_train,optuna_xgb.predict(X_train)))
display('Precision score: ', precision_score(y_train,optuna_xgb.predict(X_train)))
display('PRUC: ', auc_precision_recall_train)
display('Precision at 20% recall: ', r20prec_train)
# Performance evaluation:
display('Accuracy: ', accuracy_score(y_test,optuna_xgb.predict(X_test)))
display('F1 score: ', f1_score(y_test,optuna_xgb.predict(X_test)))
display('Recall score: ', recall_score(y_test,optuna_xgb.predict(X_test)))
display('Precision score: ', precision_score(y_test,optuna_xgb.predict(X_test)))
display('PRUC: ', auc_precision_recall_test)
display('Precision at 20% recall: ', r20prec_test)

print('Total time for the whole script: ', time.time()-time0, '\n',
     'Time for the modeling part: ', time.time()-time1)

[32m[I 2022-07-31 15:27:29,119][0m A new study created in memory with name: no-name-743a0375-b6da-435d-baa5-d5b4039d7a7c[0m
[32m[I 2022-07-31 15:27:37,106][0m Trial 0 finished with value: 0.820123464857722 and parameters: {'n_estimators': 252, 'max_depth': 8, 'learning_rate': 0.022025550949664414, 'colsample_bytree': 0.6352962125075249, 'subsample': 0.54285832154442, 'alpha': 0.37813697173440647, 'lambda': 111.76833812232796, 'gamma': 0.0026117254605560847, 'min_child_weight': 5.455410056059996}. Best is trial 0 with value: 0.820123464857722.[0m
[32m[I 2022-07-31 15:27:44,161][0m Trial 1 finished with value: 0.8181249338650463 and parameters: {'n_estimators': 448, 'max_depth': 8, 'learning_rate': 0.06158610783401419, 'colsample_bytree': 0.4458815676016047, 'subsample': 0.8971481567676587, 'alpha': 1.8215109222437262, 'lambda': 0.373624558243115, 'gamma': 4.0465098647727424e-05, 'min_child_weight': 0.4651043118799652}. Best is trial 0 with value: 0.820123464857722.[0m
[32m[I 2