Outline:
1. Load evth.
2. Preclean data.
3. Train-test split.
4. Missing values.
5. FE.
6. Modeling.
7. FI.
8. predictions.

In [60]:
import pandas as pd
import numpy as np
import seaborn as sns
import os, time, warnings, gzip, gc, random, math, shap, pickle, optuna
from IPython.display import display
from matplotlib_venn import venn2, venn2_circles, venn2_unweighted
from matplotlib_venn import venn3, venn3_circles
from matplotlib import pyplot as plt
%matplotlib inline

from sklearn.preprocessing import LabelBinarizer, LabelEncoder, OrdinalEncoder, OneHotEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, train_test_split, KFold
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, precision_recall_curve, auc
from sklearn.metrics import mean_squared_error, mean_absolute_error, roc_auc_score
from sklearn.inspection import permutation_importance
from category_encoders import MEstimateEncoder
from xgboost import XGBClassifier

pd.set_option('display.max_columns', 5000)
pd.set_option('display.max_rows',200)

warnings.filterwarnings("ignore")

# target encoding code:

### target encoding ###
# source: https://www.kaggle.com/code/ryanholbrook/feature-engineering-for-house-prices/notebook

class CrossFoldEncoder:
    def __init__(self, encoder, **kwargs):
        self.encoder_ = encoder
        self.kwargs_ = kwargs  # keyword arguments for the encoder
        self.cv_ = KFold(n_splits=4)

    # Fit an encoder on one split and transform the feature on the
    # other. Iterating over the splits in all folds gives a complete
    # transformation. We also now have one trained encoder on each
    # fold.
    def fit_transform(self, X, y, cols):
        self.fitted_encoders_ = []
        self.cols_ = cols
        X_encoded = []
        for idx_encode, idx_train in self.cv_.split(X):
            fitted_encoder = self.encoder_(cols=cols, **self.kwargs_)
            fitted_encoder.fit(
                X.iloc[idx_encode, :], y.iloc[idx_encode],
            )
            X_encoded.append(fitted_encoder.transform(X.iloc[idx_train, :])[cols])
            self.fitted_encoders_.append(fitted_encoder)
        X_encoded = pd.concat(X_encoded)
        X_encoded.columns = [name + "_encoded" for name in X_encoded.columns]
        return X_encoded

    # To transform the test data, average the encodings learned from
    # each fold.
    def transform(self, X):
        from functools import reduce

        X_encoded_list = []
        for fitted_encoder in self.fitted_encoders_:
            X_encoded = fitted_encoder.transform(X)
            X_encoded_list.append(X_encoded[self.cols_])
        X_encoded = reduce(
            lambda x, y: x.add(y, fill_value=0), X_encoded_list
        ) / len(X_encoded_list)
        X_encoded.columns = [name + "_encoded" for name in X_encoded.columns]
        return X_encoded


In [61]:
def TargetEncoderMP(train_set, test_set, feature_cols, target_col, M=5):
    
    """ This function implements terget encoding on train-test split with 2-fold CV
    It relies on previously defined CrossFoldEncoder, stolen from somehwere on SO.
    m parameter controls smoothing and is defaulted at 5 """
    
    encoder = CrossFoldEncoder(MEstimateEncoder, m=M)
    train_encoded = encoder.fit_transform(train_set, train_set[target_col], cols=feature_cols)
    test_encoded = encoder.transform(test_set)

    train_set.drop(columns=feature_cols, inplace=True)
    test_set.drop(columns=feature_cols, inplace=True)
    train_set = pd.concat([train_set, train_encoded], axis = 1)
    test_set = pd.concat([test_set, test_encoded], axis = 1)
    
    return ([train_set, test_set])
    

In [62]:
# 1. Load data #

train = pd.read_csv('../input/tabular-playground-series-may-2022/train.csv',
                   skiprows=lambda i: i>0 and random.random() > 0.2)
pred = pd.read_csv('../input/tabular-playground-series-may-2022/test.csv')
pred0 = pred.copy()
display(train.shape, train.head())

display(train.target.value_counts())
num_cols = [col for col in train.columns if train[col].nunique()>10]
num_cols.remove('f_27')
cat_cols = list(set(train.columns) - set(num_cols) - set(['target']))
print('num_cols: ', num_cols, '\n', 'cat_cols: ', cat_cols)
display(train[num_cols].describe())
cat_cols = list(set(train.columns) - set(num_cols) - set(['target']))
display([train[col].value_counts() for col in cat_cols])

#[train[col].value_counts() for col in train.columns if train[col].nunique()<10]
#display(train.count())

(90046, 33)

Unnamed: 0,id,f_00,f_01,f_02,f_03,f_04,f_05,f_06,f_07,f_08,f_09,f_10,f_11,f_12,f_13,f_14,f_15,f_16,f_17,f_18,f_19,f_20,f_21,f_22,f_23,f_24,f_25,f_26,f_27,f_28,f_29,f_30,target
0,9,0.829142,-0.082872,-0.206667,-0.032502,0.366289,0.277981,-1.113467,2,0,2,1,1,1,2,0,3,1,1,3,2.16708,-2.325547,-0.346484,-3.117106,-3.609328,2.677487,-1.7726,-3.053609,BABCBBBABD,-42.976312,0,0,1
1,13,0.594253,1.705987,-0.803039,0.707572,-1.144791,0.706629,-0.685819,3,3,4,2,2,4,3,5,3,0,3,6,1.86164,2.476877,0.091103,0.347511,-0.808147,-3.170256,2.013181,1.755256,ADBCDAGKBE,4.132834,1,1,1
2,24,-0.683852,0.845307,0.35133,1.968716,0.02062,0.739591,-0.168113,2,2,4,2,3,2,2,2,3,0,1,1,-0.777539,1.333049,-3.606945,0.198199,-1.249348,0.500795,5.002314,-1.320283,ACBCBAFNBB,227.811313,1,2,0
3,33,0.453691,-1.091397,1.332791,2.475178,-1.078069,-1.505157,-0.00067,3,1,5,1,0,3,2,2,0,3,0,0,-1.213272,-5.425635,1.089327,3.956885,-3.389517,1.995535,-3.5087,2.552841,ACAADAGACD,180.842117,0,0,1
4,45,-0.188189,-0.435209,-0.979375,0.533677,1.201778,-0.231102,-0.025276,4,6,4,0,0,2,6,0,0,6,0,2,-1.71267,0.008876,-0.306836,-1.932916,-1.987434,-0.4179,1.521264,-2.39386,BBACBADDFE,-170.381225,0,0,0


0    46132
1    43914
Name: target, dtype: int64

num_cols:  ['id', 'f_00', 'f_01', 'f_02', 'f_03', 'f_04', 'f_05', 'f_06', 'f_07', 'f_08', 'f_09', 'f_10', 'f_11', 'f_12', 'f_13', 'f_14', 'f_15', 'f_16', 'f_17', 'f_18', 'f_19', 'f_20', 'f_21', 'f_22', 'f_23', 'f_24', 'f_25', 'f_26', 'f_28'] 
 cat_cols:  ['f_30', 'f_29', 'f_27']


Unnamed: 0,id,f_00,f_01,f_02,f_03,f_04,f_05,f_06,f_07,f_08,f_09,f_10,f_11,f_12,f_13,f_14,f_15,f_16,f_17,f_18,f_19,f_20,f_21,f_22,f_23,f_24,f_25,f_26,f_28
count,90046.0,90046.0,90046.0,90046.0,90046.0,90046.0,90046.0,90046.0,90046.0,90046.0,90046.0,90046.0,90046.0,90046.0,90046.0,90046.0,90046.0,90046.0,90046.0,90046.0,90046.0,90046.0,90046.0,90046.0,90046.0,90046.0,90046.0,90046.0,90046.0
mean,450059.336728,5.1e-05,0.002615,-0.001237,-0.00885,0.001637,7.6e-05,-0.006123,2.029452,2.062646,2.365369,2.17581,1.805577,2.843302,2.242054,1.513926,2.103969,2.089876,1.863203,2.067232,0.32126,-0.177006,-0.143168,-0.01595,-0.369915,-0.351176,0.164752,0.353684,-0.397165
std,259520.421747,1.001096,0.995143,0.997304,1.000211,1.002343,0.997162,1.002101,1.655317,1.593868,1.631426,1.647043,1.534888,1.763667,1.540402,1.358377,1.569217,1.557999,1.468604,1.559201,2.31849,2.393477,2.482895,2.455228,2.449154,2.39313,2.407054,2.472582,238.445555
min,9.0,-4.39816,-4.221837,-3.968571,-4.500519,-4.150859,-4.576952,-4.842919,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-10.912201,-11.146797,-11.534975,-10.757361,-11.26696,-10.610983,-10.464727,-11.686965,-1229.753052
25%,225152.25,-0.681498,-0.670821,-0.674459,-0.683769,-0.678683,-0.668464,-0.679646,1.0,1.0,1.0,1.0,1.0,2.0,1.0,0.0,1.0,1.0,1.0,1.0,-1.218204,-1.797447,-1.801222,-1.651421,-2.017576,-1.965763,-1.447571,-1.263992,-159.555185
50%,450437.5,0.002536,0.001837,-0.001207,-0.013132,-0.001725,0.001724,-0.00704,2.0,2.0,2.0,2.0,2.0,3.0,2.0,1.0,2.0,2.0,2.0,2.0,0.338452,-0.190761,-0.14146,0.029728,-0.38378,-0.343949,0.147258,0.401234,-0.169922
75%,674425.5,0.677367,0.671533,0.675827,0.667667,0.677222,0.673633,0.671671,3.0,3.0,3.0,3.0,3.0,4.0,3.0,2.0,3.0,3.0,3.0,3.0,1.889833,1.435899,1.524702,1.659125,1.253832,1.269039,1.775231,2.011899,157.690981
max,899989.0,4.523902,4.702502,4.961982,4.45492,3.927101,4.022673,4.338643,13.0,16.0,12.0,13.0,12.0,14.0,12.0,11.0,14.0,12.0,11.0,12.0,10.554207,9.948537,12.029242,11.34408,12.07022,9.270373,11.246201,10.932414,1092.295038


[2    30259
 0    30170
 1    29617
 Name: f_30, dtype: int64,
 0    58959
 1    31087
 Name: f_29, dtype: int64,
 BBBBCBCEBC    4
 ACBBCABMDE    3
 BCBBBBDMCB    3
 BBABBADJDD    3
 ABBBBADNBC    3
              ..
 BCAAAAECBD    1
 BABCBABMEE    1
 ABBEABESEA    1
 ACBCBADKDB    1
 BABACADRAC    1
 Name: f_27, Length: 87888, dtype: int64]

In [63]:
# 3. split data #

#train_test_split approach does not work when I use TE.

test_size = 0.1
train.reset_index(inplace=True, drop=True)
test_index = random.sample(list(train.index), int(test_size*train.shape[0]))
train_ = train.iloc[list(set(train.index)-set(test_index))]
test = train.iloc[test_index]
display(train.shape, test.shape, train.head(3), test.head(3))

train0, test0 = train.copy(), test.copy()

(90046, 33)

(9004, 33)

Unnamed: 0,id,f_00,f_01,f_02,f_03,f_04,f_05,f_06,f_07,f_08,f_09,f_10,f_11,f_12,f_13,f_14,f_15,f_16,f_17,f_18,f_19,f_20,f_21,f_22,f_23,f_24,f_25,f_26,f_27,f_28,f_29,f_30,target
0,9,0.829142,-0.082872,-0.206667,-0.032502,0.366289,0.277981,-1.113467,2,0,2,1,1,1,2,0,3,1,1,3,2.16708,-2.325547,-0.346484,-3.117106,-3.609328,2.677487,-1.7726,-3.053609,BABCBBBABD,-42.976312,0,0,1
1,13,0.594253,1.705987,-0.803039,0.707572,-1.144791,0.706629,-0.685819,3,3,4,2,2,4,3,5,3,0,3,6,1.86164,2.476877,0.091103,0.347511,-0.808147,-3.170256,2.013181,1.755256,ADBCDAGKBE,4.132834,1,1,1
2,24,-0.683852,0.845307,0.35133,1.968716,0.02062,0.739591,-0.168113,2,2,4,2,3,2,2,2,3,0,1,1,-0.777539,1.333049,-3.606945,0.198199,-1.249348,0.500795,5.002314,-1.320283,ACBCBAFNBB,227.811313,1,2,0


Unnamed: 0,id,f_00,f_01,f_02,f_03,f_04,f_05,f_06,f_07,f_08,f_09,f_10,f_11,f_12,f_13,f_14,f_15,f_16,f_17,f_18,f_19,f_20,f_21,f_22,f_23,f_24,f_25,f_26,f_27,f_28,f_29,f_30,target
62321,622560,0.208312,-0.866936,0.623831,-1.10716,0.62046,0.067716,1.33368,3,5,2,4,0,6,6,4,0,2,0,2,3.131442,-0.97839,-3.526296,3.380634,1.102245,2.480529,-0.203223,-3.078405,BBAAEADPBA,281.346418,0,1,1
57935,578589,-1.344102,0.301971,-0.635332,0.551898,1.467359,0.760738,0.750802,2,1,4,1,4,2,0,0,1,0,1,3,1.114053,-0.20775,-1.582796,-0.735319,1.263769,1.969539,1.525228,-0.235317,ACBACABIBC,191.321809,0,0,1
38027,380948,-1.104382,1.345215,0.693597,1.125739,-1.039118,-1.778975,-0.011934,3,1,1,3,3,2,1,1,1,2,3,6,-1.692098,6.55669,3.402469,-1.450112,0.157936,-1.494861,-1.493435,5.00943,AABEBAEACC,-121.668524,1,1,1


In [64]:
# 5. FE #

# first do TE

train, test = train0.copy(), test0.copy()
# create noise target in pred set to be discarded later. needed to avoid error due to dimension mismatch.
pred['target'] = 1

display(train.head(), test.head())
_, pred = TargetEncoderMP(train, pred, ['f_27'], 'target')
train, test = TargetEncoderMP(train0, test, ['f_27'], 'target')
display(train.head(), test.head(), pred.head())

# then extract a target

X_train = train.copy()
y_train = X_train.pop('target')
X_test = test.copy()
y_test = X_test.pop('target')

# then do OHE with columntransformer

ohe_cols = ['f_29', 'f_30']
feature_transformer = ColumnTransformer([
   ("cat", OneHotEncoder(sparse = False, handle_unknown="ignore"), ohe_cols)],
   remainder="passthrough")
print('Number of features before transaformation: ', X_train.shape)
X_train = pd.DataFrame(feature_transformer.fit_transform(X_train), columns=feature_transformer.get_feature_names_out())
X_test = pd.DataFrame(feature_transformer.transform(X_test), columns=feature_transformer.get_feature_names_out())
pred = pd.DataFrame(feature_transformer.transform(pred), columns=feature_transformer.get_feature_names_out())
display(X_train.head(), X_test.head(), pred.head())


Unnamed: 0,id,f_00,f_01,f_02,f_03,f_04,f_05,f_06,f_07,f_08,f_09,f_10,f_11,f_12,f_13,f_14,f_15,f_16,f_17,f_18,f_19,f_20,f_21,f_22,f_23,f_24,f_25,f_26,f_27,f_28,f_29,f_30,target
0,9,0.829142,-0.082872,-0.206667,-0.032502,0.366289,0.277981,-1.113467,2,0,2,1,1,1,2,0,3,1,1,3,2.16708,-2.325547,-0.346484,-3.117106,-3.609328,2.677487,-1.7726,-3.053609,BABCBBBABD,-42.976312,0,0,1
1,13,0.594253,1.705987,-0.803039,0.707572,-1.144791,0.706629,-0.685819,3,3,4,2,2,4,3,5,3,0,3,6,1.86164,2.476877,0.091103,0.347511,-0.808147,-3.170256,2.013181,1.755256,ADBCDAGKBE,4.132834,1,1,1
2,24,-0.683852,0.845307,0.35133,1.968716,0.02062,0.739591,-0.168113,2,2,4,2,3,2,2,2,3,0,1,1,-0.777539,1.333049,-3.606945,0.198199,-1.249348,0.500795,5.002314,-1.320283,ACBCBAFNBB,227.811313,1,2,0
3,33,0.453691,-1.091397,1.332791,2.475178,-1.078069,-1.505157,-0.00067,3,1,5,1,0,3,2,2,0,3,0,0,-1.213272,-5.425635,1.089327,3.956885,-3.389517,1.995535,-3.5087,2.552841,ACAADAGACD,180.842117,0,0,1
4,45,-0.188189,-0.435209,-0.979375,0.533677,1.201778,-0.231102,-0.025276,4,6,4,0,0,2,6,0,0,6,0,2,-1.71267,0.008876,-0.306836,-1.932916,-1.987434,-0.4179,1.521264,-2.39386,BBACBADDFE,-170.381225,0,0,0


Unnamed: 0,id,f_00,f_01,f_02,f_03,f_04,f_05,f_06,f_07,f_08,f_09,f_10,f_11,f_12,f_13,f_14,f_15,f_16,f_17,f_18,f_19,f_20,f_21,f_22,f_23,f_24,f_25,f_26,f_27,f_28,f_29,f_30,target
62321,622560,0.208312,-0.866936,0.623831,-1.10716,0.62046,0.067716,1.33368,3,5,2,4,0,6,6,4,0,2,0,2,3.131442,-0.97839,-3.526296,3.380634,1.102245,2.480529,-0.203223,-3.078405,BBAAEADPBA,281.346418,0,1,1
57935,578589,-1.344102,0.301971,-0.635332,0.551898,1.467359,0.760738,0.750802,2,1,4,1,4,2,0,0,1,0,1,3,1.114053,-0.20775,-1.582796,-0.735319,1.263769,1.969539,1.525228,-0.235317,ACBACABIBC,191.321809,0,0,1
38027,380948,-1.104382,1.345215,0.693597,1.125739,-1.039118,-1.778975,-0.011934,3,1,1,3,3,2,1,1,1,2,3,6,-1.692098,6.55669,3.402469,-1.450112,0.157936,-1.494861,-1.493435,5.00943,AABEBAEACC,-121.668524,1,1,1
89768,897337,-1.736994,0.794152,1.222757,0.228743,-0.29956,-1.173759,0.88726,4,1,2,2,1,0,1,1,2,1,4,5,1.783402,3.499031,-2.604161,-0.314153,0.468883,2.537186,-1.553761,-1.638709,ADBDABBNCE,47.262215,1,2,1
71457,713533,0.22108,1.444455,-0.836671,-0.341606,0.231035,-0.967732,0.455914,3,3,1,4,3,5,1,0,0,0,5,6,-1.158251,0.596354,-0.575615,-1.495201,-2.138705,0.529145,0.781979,-1.645252,ACAEABDIBB,3.468672,0,1,1


Unnamed: 0,id,f_00,f_01,f_02,f_03,f_04,f_05,f_06,f_07,f_08,f_09,f_10,f_11,f_12,f_13,f_14,f_15,f_16,f_17,f_18,f_19,f_20,f_21,f_22,f_23,f_24,f_25,f_26,f_28,f_29,f_30,target,f_27_encoded
0,9,0.829142,-0.082872,-0.206667,-0.032502,0.366289,0.277981,-1.113467,2,0,2,1,1,1,2,0,3,1,1,3,2.16708,-2.325547,-0.346484,-3.117106,-3.609328,2.677487,-1.7726,-3.053609,-42.976312,0,0,1,0.487976
1,13,0.594253,1.705987,-0.803039,0.707572,-1.144791,0.706629,-0.685819,3,3,4,2,2,4,3,5,3,0,3,6,1.86164,2.476877,0.091103,0.347511,-0.808147,-3.170256,2.013181,1.755256,4.132834,1,1,1,0.487976
2,24,-0.683852,0.845307,0.35133,1.968716,0.02062,0.739591,-0.168113,2,2,4,2,3,2,2,2,3,0,1,1,-0.777539,1.333049,-3.606945,0.198199,-1.249348,0.500795,5.002314,-1.320283,227.811313,1,2,0,0.487976
3,33,0.453691,-1.091397,1.332791,2.475178,-1.078069,-1.505157,-0.00067,3,1,5,1,0,3,2,2,0,3,0,0,-1.213272,-5.425635,1.089327,3.956885,-3.389517,1.995535,-3.5087,2.552841,180.842117,0,0,1,0.487976
4,45,-0.188189,-0.435209,-0.979375,0.533677,1.201778,-0.231102,-0.025276,4,6,4,0,0,2,6,0,0,6,0,2,-1.71267,0.008876,-0.306836,-1.932916,-1.987434,-0.4179,1.521264,-2.39386,-170.381225,0,0,0,0.487976


Unnamed: 0,id,f_00,f_01,f_02,f_03,f_04,f_05,f_06,f_07,f_08,f_09,f_10,f_11,f_12,f_13,f_14,f_15,f_16,f_17,f_18,f_19,f_20,f_21,f_22,f_23,f_24,f_25,f_26,f_28,f_29,f_30,target,f_27_encoded
62321,622560,0.208312,-0.866936,0.623831,-1.10716,0.62046,0.067716,1.33368,3,5,2,4,0,6,6,4,0,2,0,2,3.131442,-0.97839,-3.526296,3.380634,1.102245,2.480529,-0.203223,-3.078405,281.346418,0,1,1,0.603575
57935,578589,-1.344102,0.301971,-0.635332,0.551898,1.467359,0.760738,0.750802,2,1,4,1,4,2,0,0,1,0,1,3,1.114053,-0.20775,-1.582796,-0.735319,1.263769,1.969539,1.525228,-0.235317,191.321809,0,0,1,0.551726
38027,380948,-1.104382,1.345215,0.693597,1.125739,-1.039118,-1.778975,-0.011934,3,1,1,3,3,2,1,1,1,2,3,6,-1.692098,6.55669,3.402469,-1.450112,0.157936,-1.494861,-1.493435,5.00943,-121.668524,1,1,1,0.551743
89768,897337,-1.736994,0.794152,1.222757,0.228743,-0.29956,-1.173759,0.88726,4,1,2,2,1,0,1,1,2,1,4,5,1.783402,3.499031,-2.604161,-0.314153,0.468883,2.537186,-1.553761,-1.638709,47.262215,1,2,1,0.55169
71457,713533,0.22108,1.444455,-0.836671,-0.341606,0.231035,-0.967732,0.455914,3,3,1,4,3,5,1,0,0,0,5,6,-1.158251,0.596354,-0.575615,-1.495201,-2.138705,0.529145,0.781979,-1.645252,3.468672,0,1,1,0.603543


Unnamed: 0,id,f_00,f_01,f_02,f_03,f_04,f_05,f_06,f_07,f_08,f_09,f_10,f_11,f_12,f_13,f_14,f_15,f_16,f_17,f_18,f_19,f_20,f_21,f_22,f_23,f_24,f_25,f_26,f_28,f_29,f_30,target,f_27_encoded
0,900000,0.442517,0.17438,-0.999816,0.762741,0.186778,-1.074775,0.501888,6,6,0,0,0,5,1,1,2,2,0,1,-1.128371,2.749347,-1.0064,-1.193879,-2.435736,-2.42743,-1.966887,5.734205,99.478419,0,0,1,0.487684
1,900001,-0.605598,-0.305715,0.627667,-0.578898,-1.750931,1.35555,-0.190911,1,3,4,0,2,1,3,0,0,0,2,0,-4.424098,1.080762,2.382405,0.149442,1.883322,-2.848714,-0.725155,3.194219,-65.993825,1,0,1,0.487684
2,900002,0.30399,2.44511,0.246515,0.818248,0.359731,-1.331845,1.358622,3,3,4,1,1,3,2,2,4,1,0,3,-1.523864,-1.406712,-7.026098,1.312277,-5.157192,1.714005,0.585032,0.066898,-87.405622,0,1,1,0.487684
3,900003,0.154053,0.260126,-1.367092,-0.093175,-1.111034,-0.948481,1.11922,0,0,4,2,1,5,0,3,3,1,4,2,-1.404597,3.011085,-0.594532,-3.939475,1.75457,-2.364007,-1.00332,3.893099,-281.29346,0,0,1,0.487684
4,900004,-1.651904,-0.424266,-0.667356,-0.322124,-0.089462,0.181705,1.784983,2,2,2,0,0,3,0,1,2,0,2,2,-1.968516,0.100594,0.084906,-0.985736,-0.130467,-3.557893,1.210687,1.861884,25.629415,0,2,1,0.487684


Number of features before transaformation:  (90046, 32)


Unnamed: 0,cat__f_29_0,cat__f_29_1,cat__f_30_0,cat__f_30_1,cat__f_30_2,remainder__id,remainder__f_00,remainder__f_01,remainder__f_02,remainder__f_03,remainder__f_04,remainder__f_05,remainder__f_06,remainder__f_07,remainder__f_08,remainder__f_09,remainder__f_10,remainder__f_11,remainder__f_12,remainder__f_13,remainder__f_14,remainder__f_15,remainder__f_16,remainder__f_17,remainder__f_18,remainder__f_19,remainder__f_20,remainder__f_21,remainder__f_22,remainder__f_23,remainder__f_24,remainder__f_25,remainder__f_26,remainder__f_28,remainder__f_27_encoded
0,1.0,0.0,1.0,0.0,0.0,9.0,0.829142,-0.082872,-0.206667,-0.032502,0.366289,0.277981,-1.113467,2.0,0.0,2.0,1.0,1.0,1.0,2.0,0.0,3.0,1.0,1.0,3.0,2.16708,-2.325547,-0.346484,-3.117106,-3.609328,2.677487,-1.7726,-3.053609,-42.976312,0.487976
1,0.0,1.0,0.0,1.0,0.0,13.0,0.594253,1.705987,-0.803039,0.707572,-1.144791,0.706629,-0.685819,3.0,3.0,4.0,2.0,2.0,4.0,3.0,5.0,3.0,0.0,3.0,6.0,1.86164,2.476877,0.091103,0.347511,-0.808147,-3.170256,2.013181,1.755256,4.132834,0.487976
2,0.0,1.0,0.0,0.0,1.0,24.0,-0.683852,0.845307,0.35133,1.968716,0.02062,0.739591,-0.168113,2.0,2.0,4.0,2.0,3.0,2.0,2.0,2.0,3.0,0.0,1.0,1.0,-0.777539,1.333049,-3.606945,0.198199,-1.249348,0.500795,5.002314,-1.320283,227.811313,0.487976
3,1.0,0.0,1.0,0.0,0.0,33.0,0.453691,-1.091397,1.332791,2.475178,-1.078069,-1.505157,-0.00067,3.0,1.0,5.0,1.0,0.0,3.0,2.0,2.0,0.0,3.0,0.0,0.0,-1.213272,-5.425635,1.089327,3.956885,-3.389517,1.995535,-3.5087,2.552841,180.842117,0.487976
4,1.0,0.0,1.0,0.0,0.0,45.0,-0.188189,-0.435209,-0.979375,0.533677,1.201778,-0.231102,-0.025276,4.0,6.0,4.0,0.0,0.0,2.0,6.0,0.0,0.0,6.0,0.0,2.0,-1.71267,0.008876,-0.306836,-1.932916,-1.987434,-0.4179,1.521264,-2.39386,-170.381225,0.487976


Unnamed: 0,cat__f_29_0,cat__f_29_1,cat__f_30_0,cat__f_30_1,cat__f_30_2,remainder__id,remainder__f_00,remainder__f_01,remainder__f_02,remainder__f_03,remainder__f_04,remainder__f_05,remainder__f_06,remainder__f_07,remainder__f_08,remainder__f_09,remainder__f_10,remainder__f_11,remainder__f_12,remainder__f_13,remainder__f_14,remainder__f_15,remainder__f_16,remainder__f_17,remainder__f_18,remainder__f_19,remainder__f_20,remainder__f_21,remainder__f_22,remainder__f_23,remainder__f_24,remainder__f_25,remainder__f_26,remainder__f_28,remainder__f_27_encoded
0,1.0,0.0,0.0,1.0,0.0,622560.0,0.208312,-0.866936,0.623831,-1.10716,0.62046,0.067716,1.33368,3.0,5.0,2.0,4.0,0.0,6.0,6.0,4.0,0.0,2.0,0.0,2.0,3.131442,-0.97839,-3.526296,3.380634,1.102245,2.480529,-0.203223,-3.078405,281.346418,0.603575
1,1.0,0.0,1.0,0.0,0.0,578589.0,-1.344102,0.301971,-0.635332,0.551898,1.467359,0.760738,0.750802,2.0,1.0,4.0,1.0,4.0,2.0,0.0,0.0,1.0,0.0,1.0,3.0,1.114053,-0.20775,-1.582796,-0.735319,1.263769,1.969539,1.525228,-0.235317,191.321809,0.551726
2,0.0,1.0,0.0,1.0,0.0,380948.0,-1.104382,1.345215,0.693597,1.125739,-1.039118,-1.778975,-0.011934,3.0,1.0,1.0,3.0,3.0,2.0,1.0,1.0,1.0,2.0,3.0,6.0,-1.692098,6.55669,3.402469,-1.450112,0.157936,-1.494861,-1.493435,5.00943,-121.668524,0.551743
3,0.0,1.0,0.0,0.0,1.0,897337.0,-1.736994,0.794152,1.222757,0.228743,-0.29956,-1.173759,0.88726,4.0,1.0,2.0,2.0,1.0,0.0,1.0,1.0,2.0,1.0,4.0,5.0,1.783402,3.499031,-2.604161,-0.314153,0.468883,2.537186,-1.553761,-1.638709,47.262215,0.55169
4,1.0,0.0,0.0,1.0,0.0,713533.0,0.22108,1.444455,-0.836671,-0.341606,0.231035,-0.967732,0.455914,3.0,3.0,1.0,4.0,3.0,5.0,1.0,0.0,0.0,0.0,5.0,6.0,-1.158251,0.596354,-0.575615,-1.495201,-2.138705,0.529145,0.781979,-1.645252,3.468672,0.603543


Unnamed: 0,cat__f_29_0,cat__f_29_1,cat__f_30_0,cat__f_30_1,cat__f_30_2,remainder__id,remainder__f_00,remainder__f_01,remainder__f_02,remainder__f_03,remainder__f_04,remainder__f_05,remainder__f_06,remainder__f_07,remainder__f_08,remainder__f_09,remainder__f_10,remainder__f_11,remainder__f_12,remainder__f_13,remainder__f_14,remainder__f_15,remainder__f_16,remainder__f_17,remainder__f_18,remainder__f_19,remainder__f_20,remainder__f_21,remainder__f_22,remainder__f_23,remainder__f_24,remainder__f_25,remainder__f_26,remainder__f_28,remainder__f_27_encoded
0,1.0,0.0,1.0,0.0,0.0,900000.0,0.442517,0.17438,-0.999816,0.762741,0.186778,-1.074775,0.501888,6.0,6.0,0.0,0.0,0.0,5.0,1.0,1.0,2.0,2.0,0.0,1.0,-1.128371,2.749347,-1.0064,-1.193879,-2.435736,-2.42743,-1.966887,5.734205,99.478419,0.487684
1,0.0,1.0,1.0,0.0,0.0,900001.0,-0.605598,-0.305715,0.627667,-0.578898,-1.750931,1.35555,-0.190911,1.0,3.0,4.0,0.0,2.0,1.0,3.0,0.0,0.0,0.0,2.0,0.0,-4.424098,1.080762,2.382405,0.149442,1.883322,-2.848714,-0.725155,3.194219,-65.993825,0.487684
2,1.0,0.0,0.0,1.0,0.0,900002.0,0.30399,2.44511,0.246515,0.818248,0.359731,-1.331845,1.358622,3.0,3.0,4.0,1.0,1.0,3.0,2.0,2.0,4.0,1.0,0.0,3.0,-1.523864,-1.406712,-7.026098,1.312277,-5.157192,1.714005,0.585032,0.066898,-87.405622,0.487684
3,1.0,0.0,1.0,0.0,0.0,900003.0,0.154053,0.260126,-1.367092,-0.093175,-1.111034,-0.948481,1.11922,0.0,0.0,4.0,2.0,1.0,5.0,0.0,3.0,3.0,1.0,4.0,2.0,-1.404597,3.011085,-0.594532,-3.939475,1.75457,-2.364007,-1.00332,3.893099,-281.29346,0.487684
4,1.0,0.0,0.0,0.0,1.0,900004.0,-1.651904,-0.424266,-0.667356,-0.322124,-0.089462,0.181705,1.784983,2.0,2.0,2.0,0.0,0.0,3.0,0.0,1.0,2.0,0.0,2.0,2.0,-1.968516,0.100594,0.084906,-0.985736,-0.130467,-3.557893,1.210687,1.861884,25.629415,0.487684


In [65]:
# 6. fit XGBoost #

time1 = time.time()
xgb = XGBClassifier(n_estimators=100, max_depth=6, eta=0.1)
xgb.fit(X_train, y_train)
display(time.time()-time1)


display('Accuracy: ', accuracy_score(y_train,xgb.predict(X_train)))
display('F1 score: ', f1_score(y_train,xgb.predict(X_train)))
display('Recall score: ', recall_score(y_train,xgb.predict(X_train)))
display('Precision score: ', precision_score(y_train,xgb.predict(X_train)))
# Performance evaluation:
display('Accuracy: ', accuracy_score(y_test,xgb.predict(X_test)))
display('F1 score: ', f1_score(y_test,xgb.predict(X_test)))
display('Recall score: ', recall_score(y_test,xgb.predict(X_test)))
display('Precision score: ', precision_score(y_test,xgb.predict(X_test)))


40.272281885147095

'Accuracy: '

0.8432023632365679

'F1 score: '

0.8362957552146741

'Recall score: '

0.8212415175114998

'Precision score: '

0.8519122197812581

'Accuracy: '

0.8949355841848068

'F1 score: '

0.8869232608175951

'Recall score: '

0.8546417876065423

'Precision score: '

0.9217391304347826

In [66]:
# optuna hyperparameter optimization

time1 = time.time()

def objective(trial, n_splits=2, n_jobs=-1, scale_pos_weight=1, early_stopping_rounds=50):

    cv_regularizer=0.0
    # Usually values between 0.1 and 0.2 work fine.

    params = {
        "tree_method": 'gpu_hist',
        "verbosity": 0,  # 0 (silent) - 3 (debug)
        "n_estimators": trial.suggest_int("n_estimators", 100, 700),
        "max_depth": trial.suggest_int("max_depth", 4, 10),
        "learning_rate": trial.suggest_uniform("learning_rate", 0.02, 0.3),
        "colsample_bytree": trial.suggest_uniform("colsample_bytree", 0.1, 0.95),
        "subsample": trial.suggest_uniform("subsample", 0.5, 0.95),
        "alpha": trial.suggest_loguniform("alpha", 0.1, 10.0),
        "lambda": trial.suggest_loguniform("lambda", 0.1, 100.0),
        "gamma": trial.suggest_loguniform("gamma", 1e-10, 10.0),
        "min_child_weight": trial.suggest_loguniform("min_child_weight", 0.1, 10),
        "n_jobs": n_jobs,
    }

    X = X_train
    y = y_train

    model = XGBClassifier(**params)
    rkf = KFold(n_splits=n_splits, shuffle=True)
    X_values = X.values
    y_values = y.values
    y_pred = np.zeros_like(y_values)
    y_pred_train = np.zeros_like(y_values)
    for train_index, test_index in rkf.split(X_values):
        X_A, X_B = X_values[train_index, :], X_values[test_index, :]
        y_A, y_B = y_values[train_index], y_values[test_index]
        model.fit(X_A, y_A, eval_set=[(X_B, y_B)],
                  early_stopping_rounds=early_stopping_rounds, verbose = False)
        y_pred[test_index] += model.predict(X_B)
        y_pred_train[train_index] += model.predict(X_A)
    score_train = roc_auc_score(y_train, y_pred_train)
    score_test = roc_auc_score(y_train, y_pred) 
    overfit = score_train-score_test
    #return (f1_score_test)
    return (score_test-cv_regularizer*overfit)


study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=40)
print('Total time for hypermarameter optimization ', time.time()-time1)
hp = study.best_params
for key, value in hp.items():
    print(f"{key:>20s} : {value}")
print(f"{'best objective value':>20s} : {study.best_value}")

optuna_hyperpars = study.best_params
optuna_hyperpars['tree_method']='gpu_hist'
optuna_hyperpars['scale_pos_weight']=1
#optuna_hyperpars['early_stopping_rounds']=50

optuna_xgb = XGBClassifier(**optuna_hyperpars)
optuna_xgb.fit(X_train, y_train)


[32m[I 2022-07-21 20:57:36,676][0m A new study created in memory with name: no-name-95457543-78fb-4ed0-8c05-fa6cf96a2a9a[0m
[32m[I 2022-07-21 20:57:43,964][0m Trial 0 finished with value: 0.816484850194397 and parameters: {'n_estimators': 318, 'max_depth': 9, 'learning_rate': 0.25867049673920145, 'colsample_bytree': 0.4233331796909727, 'subsample': 0.8383931295882981, 'alpha': 1.4076083921325706, 'lambda': 0.5088893912547859, 'gamma': 0.00014940658251675658, 'min_child_weight': 8.54344654547678}. Best is trial 0 with value: 0.816484850194397.[0m
[32m[I 2022-07-21 20:57:47,862][0m Trial 1 finished with value: 0.8038752098333847 and parameters: {'n_estimators': 577, 'max_depth': 4, 'learning_rate': 0.2956755436931472, 'colsample_bytree': 0.1830938958001747, 'subsample': 0.8091201808179262, 'alpha': 0.8889298597195985, 'lambda': 0.5476117927581627, 'gamma': 1.3488768046559292e-08, 'min_child_weight': 1.4529163839183699}. Best is trial 0 with value: 0.816484850194397.[0m
[32m[I 2

Total time for hypermarameter optimization  508.5748300552368
        n_estimators : 608
           max_depth : 10
       learning_rate : 0.06656314832034932
    colsample_bytree : 0.8816259324577717
           subsample : 0.8817316216976288
               alpha : 1.2014935706867473
              lambda : 0.19186091410699885
               gamma : 5.715035032629268e-09
    min_child_weight : 0.611786099821499
best objective value : 0.8470746476008117


XGBClassifier(alpha=1.2014935706867473, base_score=0.5, booster='gbtree',
              callbacks=None, colsample_bylevel=1, colsample_bynode=1,
              colsample_bytree=0.8816259324577717, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None,
              gamma=5.715035032629268e-09, gpu_id=0, grow_policy='depthwise',
              importance_type=None, interaction_constraints='',
              lambda=0.19186091410699885, learning_rate=0.06656314832034932,
              max_bin=256, max_cat_to_onehot=4, max_delta_step=0, max_depth=10,
              max_leaves=0, min_child_weight=0.611786099821499, missing=nan,
              monotone_constraints='()', n_estimators=608, n_jobs=0,
              num_parallel_tree=1, predictor='auto', random_state=0, ...)

In [71]:
display('Accuracy: ', accuracy_score(y_train,optuna_xgb.predict(X_train)))
display('F1 score: ', f1_score(y_train,optuna_xgb.predict(X_train)))
display('Recall score: ', recall_score(y_train,optuna_xgb.predict(X_train)))
display('Precision score: ', precision_score(y_train,optuna_xgb.predict(X_train)))
display('ROCAUC score: ', roc_auc_score(y_train,optuna_xgb.predict(X_train)))
# Performance evaluation:
display('Accuracy: ', accuracy_score(y_test,optuna_xgb.predict(X_test)))
display('F1 score: ', f1_score(y_test,optuna_xgb.predict(X_test)))
display('Recall score: ', recall_score(y_test,optuna_xgb.predict(X_test)))
display('Precision score: ', precision_score(y_test,optuna_xgb.predict(X_test)))
display('ROCAUC score: ', roc_auc_score(y_test,optuna_xgb.predict(X_test)))

'Accuracy: '

0.9964351553650357

'F1 score: '

0.9963425055546061

'Recall score: '

0.9956278180079245

'Precision score: '

0.9970582198809606

'ROCAUC score: '

0.9964157472073786

'Accuracy: '

0.9984451354953354

'F1 score: '

0.9983859810929213

'Recall score: '

0.9974660216539968

'Precision score: '

0.9993076390491576

'ROCAUC score: '

0.9984113295059605

In [68]:
display(X_test.head(), pred.head())

Unnamed: 0,cat__f_29_0,cat__f_29_1,cat__f_30_0,cat__f_30_1,cat__f_30_2,remainder__id,remainder__f_00,remainder__f_01,remainder__f_02,remainder__f_03,remainder__f_04,remainder__f_05,remainder__f_06,remainder__f_07,remainder__f_08,remainder__f_09,remainder__f_10,remainder__f_11,remainder__f_12,remainder__f_13,remainder__f_14,remainder__f_15,remainder__f_16,remainder__f_17,remainder__f_18,remainder__f_19,remainder__f_20,remainder__f_21,remainder__f_22,remainder__f_23,remainder__f_24,remainder__f_25,remainder__f_26,remainder__f_28,remainder__f_27_encoded
0,1.0,0.0,0.0,1.0,0.0,622560.0,0.208312,-0.866936,0.623831,-1.10716,0.62046,0.067716,1.33368,3.0,5.0,2.0,4.0,0.0,6.0,6.0,4.0,0.0,2.0,0.0,2.0,3.131442,-0.97839,-3.526296,3.380634,1.102245,2.480529,-0.203223,-3.078405,281.346418,0.603575
1,1.0,0.0,1.0,0.0,0.0,578589.0,-1.344102,0.301971,-0.635332,0.551898,1.467359,0.760738,0.750802,2.0,1.0,4.0,1.0,4.0,2.0,0.0,0.0,1.0,0.0,1.0,3.0,1.114053,-0.20775,-1.582796,-0.735319,1.263769,1.969539,1.525228,-0.235317,191.321809,0.551726
2,0.0,1.0,0.0,1.0,0.0,380948.0,-1.104382,1.345215,0.693597,1.125739,-1.039118,-1.778975,-0.011934,3.0,1.0,1.0,3.0,3.0,2.0,1.0,1.0,1.0,2.0,3.0,6.0,-1.692098,6.55669,3.402469,-1.450112,0.157936,-1.494861,-1.493435,5.00943,-121.668524,0.551743
3,0.0,1.0,0.0,0.0,1.0,897337.0,-1.736994,0.794152,1.222757,0.228743,-0.29956,-1.173759,0.88726,4.0,1.0,2.0,2.0,1.0,0.0,1.0,1.0,2.0,1.0,4.0,5.0,1.783402,3.499031,-2.604161,-0.314153,0.468883,2.537186,-1.553761,-1.638709,47.262215,0.55169
4,1.0,0.0,0.0,1.0,0.0,713533.0,0.22108,1.444455,-0.836671,-0.341606,0.231035,-0.967732,0.455914,3.0,3.0,1.0,4.0,3.0,5.0,1.0,0.0,0.0,0.0,5.0,6.0,-1.158251,0.596354,-0.575615,-1.495201,-2.138705,0.529145,0.781979,-1.645252,3.468672,0.603543


Unnamed: 0,cat__f_29_0,cat__f_29_1,cat__f_30_0,cat__f_30_1,cat__f_30_2,remainder__id,remainder__f_00,remainder__f_01,remainder__f_02,remainder__f_03,remainder__f_04,remainder__f_05,remainder__f_06,remainder__f_07,remainder__f_08,remainder__f_09,remainder__f_10,remainder__f_11,remainder__f_12,remainder__f_13,remainder__f_14,remainder__f_15,remainder__f_16,remainder__f_17,remainder__f_18,remainder__f_19,remainder__f_20,remainder__f_21,remainder__f_22,remainder__f_23,remainder__f_24,remainder__f_25,remainder__f_26,remainder__f_28,remainder__f_27_encoded
0,1.0,0.0,1.0,0.0,0.0,900000.0,0.442517,0.17438,-0.999816,0.762741,0.186778,-1.074775,0.501888,6.0,6.0,0.0,0.0,0.0,5.0,1.0,1.0,2.0,2.0,0.0,1.0,-1.128371,2.749347,-1.0064,-1.193879,-2.435736,-2.42743,-1.966887,5.734205,99.478419,0.487684
1,0.0,1.0,1.0,0.0,0.0,900001.0,-0.605598,-0.305715,0.627667,-0.578898,-1.750931,1.35555,-0.190911,1.0,3.0,4.0,0.0,2.0,1.0,3.0,0.0,0.0,0.0,2.0,0.0,-4.424098,1.080762,2.382405,0.149442,1.883322,-2.848714,-0.725155,3.194219,-65.993825,0.487684
2,1.0,0.0,0.0,1.0,0.0,900002.0,0.30399,2.44511,0.246515,0.818248,0.359731,-1.331845,1.358622,3.0,3.0,4.0,1.0,1.0,3.0,2.0,2.0,4.0,1.0,0.0,3.0,-1.523864,-1.406712,-7.026098,1.312277,-5.157192,1.714005,0.585032,0.066898,-87.405622,0.487684
3,1.0,0.0,1.0,0.0,0.0,900003.0,0.154053,0.260126,-1.367092,-0.093175,-1.111034,-0.948481,1.11922,0.0,0.0,4.0,2.0,1.0,5.0,0.0,3.0,3.0,1.0,4.0,2.0,-1.404597,3.011085,-0.594532,-3.939475,1.75457,-2.364007,-1.00332,3.893099,-281.29346,0.487684
4,1.0,0.0,0.0,0.0,1.0,900004.0,-1.651904,-0.424266,-0.667356,-0.322124,-0.089462,0.181705,1.784983,2.0,2.0,2.0,0.0,0.0,3.0,0.0,1.0,2.0,0.0,2.0,2.0,-1.968516,0.100594,0.084906,-0.985736,-0.130467,-3.557893,1.210687,1.861884,25.629415,0.487684


In [69]:
# 7. Do FI analysis 

In [70]:
# 8. Generate predictions

submission_df_bt = pd.DataFrame({'id': pred0.id, 'target': optuna_xgb.predict(pred)}, columns=['id', 'target'])
submission_df_bt.to_csv('KP20_PGS0522.csv',index=False)

os.chdir(r'/kaggle/working')

from IPython.display import FileLink
FileLink(r'KP20_PGS0522.csv')