Outline:
1. Load evth.
2. Preclean data.
3. Train-test split.
4. Missing values.
5. FE.
6. Modeling.
7. FI.
8. predictions.

In [83]:
import pandas as pd
import numpy as np
import seaborn as sns
import os, time, warnings, gzip, gc, random, math, shap, pickle, optuna
from IPython.display import display
from matplotlib_venn import venn2, venn2_circles, venn2_unweighted
from matplotlib_venn import venn3, venn3_circles
from matplotlib import pyplot as plt
%matplotlib inline

from sklearn.preprocessing import LabelBinarizer, LabelEncoder, OrdinalEncoder, OneHotEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, train_test_split, KFold
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, precision_recall_curve, auc
from sklearn.metrics import mean_squared_error, mean_absolute_error, roc_auc_score
from sklearn.inspection import permutation_importance
from category_encoders import MEstimateEncoder
from xgboost import XGBClassifier

pd.set_option('display.max_columns', 5000)
pd.set_option('display.max_rows',200)

warnings.filterwarnings("ignore")

# target encoding code:

### target encoding ###
# source: https://www.kaggle.com/code/ryanholbrook/feature-engineering-for-house-prices/notebook

class CrossFoldEncoder:
    def __init__(self, encoder, **kwargs):
        self.encoder_ = encoder
        self.kwargs_ = kwargs  # keyword arguments for the encoder
        self.cv_ = KFold(n_splits=4)

    # Fit an encoder on one split and transform the feature on the
    # other. Iterating over the splits in all folds gives a complete
    # transformation. We also now have one trained encoder on each
    # fold.
    def fit_transform(self, X, y, cols):
        self.fitted_encoders_ = []
        self.cols_ = cols
        X_encoded = []
        for idx_encode, idx_train in self.cv_.split(X):
            fitted_encoder = self.encoder_(cols=cols, **self.kwargs_)
            fitted_encoder.fit(
                X.iloc[idx_encode, :], y.iloc[idx_encode],
            )
            X_encoded.append(fitted_encoder.transform(X.iloc[idx_train, :])[cols])
            self.fitted_encoders_.append(fitted_encoder)
        X_encoded = pd.concat(X_encoded)
        X_encoded.columns = [name + "_encoded" for name in X_encoded.columns]
        return X_encoded

    # To transform the test data, average the encodings learned from
    # each fold.
    def transform(self, X):
        from functools import reduce

        X_encoded_list = []
        for fitted_encoder in self.fitted_encoders_:
            X_encoded = fitted_encoder.transform(X)
            X_encoded_list.append(X_encoded[self.cols_])
        X_encoded = reduce(
            lambda x, y: x.add(y, fill_value=0), X_encoded_list
        ) / len(X_encoded_list)
        X_encoded.columns = [name + "_encoded" for name in X_encoded.columns]
        return X_encoded


In [84]:
def TargetEncoderMP(train_set, test_set, feature_cols, target_col, M=5):
    
    """ This function implements terget encoding on train-test split with 2-fold CV
    It relies on previously defined CrossFoldEncoder, stolen from somehwere on SO.
    m parameter controls smoothing and is defaulted at 5 """
    
    encoder = CrossFoldEncoder(MEstimateEncoder, m=M)
    train_encoded = encoder.fit_transform(train_set, train_set[target_col], cols=feature_cols)
    test_encoded = encoder.transform(test_set)

    train_set.drop(columns=feature_cols, inplace=True)
    test_set.drop(columns=feature_cols, inplace=True)
    train_set = pd.concat([train_set, train_encoded], axis = 1)
    test_set = pd.concat([test_set, test_encoded], axis = 1)
    
    return ([train_set, test_set])
    

In [85]:
# 1. Load data #

train = pd.read_csv('../input/tabular-playground-series-may-2022/train.csv',
                   skiprows=lambda i: i>0 and random.random() > 0.99)
pred = pd.read_csv('../input/tabular-playground-series-may-2022/test.csv')
pred0 = pred.copy()
display(train.shape, train.head())

display(train.target.value_counts())
num_cols = [col for col in train.columns if train[col].nunique()>10]
num_cols.remove('f_27')
cat_cols = list(set(train.columns) - set(num_cols) - set(['target']))
print('num_cols: ', num_cols, '\n', 'cat_cols: ', cat_cols)
display(train[num_cols].describe())
cat_cols = list(set(train.columns) - set(num_cols) - set(['target']))
display([train[col].value_counts() for col in cat_cols])

#[train[col].value_counts() for col in train.columns if train[col].nunique()<10]
#display(train.count())

(891020, 33)

Unnamed: 0,id,f_00,f_01,f_02,f_03,f_04,f_05,f_06,f_07,f_08,f_09,f_10,f_11,f_12,f_13,f_14,f_15,f_16,f_17,f_18,f_19,f_20,f_21,f_22,f_23,f_24,f_25,f_26,f_27,f_28,f_29,f_30,target
0,0,-1.373246,0.238887,-0.243376,0.567405,-0.647715,0.839326,0.113133,1,5,1,3,3,3,1,6,1,0,7,4,0.298218,-0.919717,3.058541,-2.540739,0.766952,-2.730628,-0.208177,1.363402,ABABDADBAB,67.609153,0,0,0
1,1,1.697021,-1.710322,-2.230332,-0.545661,1.113173,-1.552175,0.447825,1,3,4,0,2,3,0,1,0,4,6,0,-3.147667,-1.075434,2.17905,2.278315,-0.633658,-1.217077,-3.782194,-0.058316,ACACCADCEB,377.096415,0,0,1
2,2,1.681726,0.616746,-1.027689,0.810492,-0.609086,0.113965,-0.70866,1,0,2,6,6,4,3,1,2,2,1,4,2.820733,-3.485342,-0.784235,-1.385775,-0.520558,-0.009121,2.788536,-3.703488,AAAEABCKAD,-195.599702,0,2,1
3,3,-0.118172,-0.587835,-0.804638,2.086822,0.371005,-0.128831,-0.282575,3,2,1,0,1,6,4,2,3,3,0,3,1.081084,-2.100177,-2.343819,0.572594,-1.653213,1.686035,-2.533098,-0.608601,BDBBAACBCB,210.826205,0,0,1
4,4,1.148481,-0.176567,-0.664871,-1.101343,0.467875,0.500117,0.407515,3,3,0,4,3,0,6,0,3,3,1,0,-0.126179,0.605033,1.133665,-3.912929,-1.430366,2.127649,-3.306784,4.371371,BDBCBBCHFE,-217.211798,0,1,1


0    457597
1    433423
Name: target, dtype: int64

num_cols:  ['id', 'f_00', 'f_01', 'f_02', 'f_03', 'f_04', 'f_05', 'f_06', 'f_07', 'f_08', 'f_09', 'f_10', 'f_11', 'f_12', 'f_13', 'f_14', 'f_15', 'f_16', 'f_17', 'f_18', 'f_19', 'f_20', 'f_21', 'f_22', 'f_23', 'f_24', 'f_25', 'f_26', 'f_28'] 
 cat_cols:  ['f_30', 'f_29', 'f_27']


Unnamed: 0,id,f_00,f_01,f_02,f_03,f_04,f_05,f_06,f_07,f_08,f_09,f_10,f_11,f_12,f_13,f_14,f_15,f_16,f_17,f_18,f_19,f_20,f_21,f_22,f_23,f_24,f_25,f_26,f_28
count,891020.0,891020.0,891020.0,891020.0,891020.0,891020.0,891020.0,891020.0,891020.0,891020.0,891020.0,891020.0,891020.0,891020.0,891020.0,891020.0,891020.0,891020.0,891020.0,891020.0,891020.0,891020.0,891020.0,891020.0,891020.0,891020.0,891020.0,891020.0,891020.0
mean,449984.004526,-8.8e-05,0.001194,0.001177,-0.001364,-0.000488,0.000179,-0.000911,2.031541,2.05775,2.362579,2.177461,1.803263,2.842216,2.239833,1.514703,2.100973,2.096549,1.858505,2.065082,0.308551,-0.178777,-0.156338,-0.008953,-0.369601,-0.343051,0.176989,0.357605,-0.407381
std,259799.271577,0.998947,0.99916,1.000496,1.000169,1.000151,1.000025,0.999853,1.656317,1.590897,1.63764,1.645734,1.53739,1.762582,1.538654,1.359234,1.569028,1.560135,1.467632,1.564691,2.316122,2.400436,2.484587,2.450652,2.453118,2.386664,2.417084,2.475709,238.757201
min,0.0,-4.599856,-4.682199,-4.642676,-4.658816,-4.748501,-4.750214,-4.842919,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-11.280941,-11.257917,-13.310146,-11.85353,-12.301097,-11.416189,-11.918306,-14.300577,-1229.753052
25%,224997.75,-0.675399,-0.675036,-0.674329,-0.676004,-0.675813,-0.673713,-0.675076,1.0,1.0,1.0,1.0,1.0,2.0,1.0,0.0,1.0,1.0,1.0,1.0,-1.236098,-1.80469,-1.820409,-1.644913,-2.019768,-1.956176,-1.439895,-1.261643,-159.417211
50%,450018.5,0.001399,0.002085,0.002107,-0.002063,-0.001656,-0.000673,-0.001688,2.0,2.0,2.0,2.0,2.0,3.0,2.0,1.0,2.0,2.0,2.0,2.0,0.330109,-0.190407,-0.152782,0.031265,-0.39127,-0.340776,0.161212,0.404247,-0.551254
75%,674944.25,0.674496,0.674876,0.677475,0.672561,0.673886,0.675213,0.674396,3.0,3.0,3.0,3.0,3.0,4.0,3.0,2.0,3.0,3.0,3.0,3.0,1.880655,1.44421,1.507343,1.662118,1.255132,1.266065,1.796373,2.028343,158.94722
max,899999.0,4.749301,4.815699,4.961982,4.338156,4.948983,4.971881,4.822668,15.0,16.0,13.0,14.0,13.0,16.0,12.0,14.0,14.0,15.0,14.0,13.0,12.079667,11.475325,12.029242,11.34408,12.2471,12.389844,12.529179,12.913041,1229.562577


[2    299974
 0    297628
 1    293418
 Name: f_30, dtype: int64,
 0    583083
 1    307937
 Name: f_29, dtype: int64,
 BBBBBBCJBC    12
 BCBBBBCLBC    12
 BBBBBBDKBC    10
 BBBBBABLCB    10
 BBBBBBDPCB    10
               ..
 ACADBACMEB     1
 BABDCADHBE     1
 ACBABABEAH     1
 BABDABDKDD     1
 BCAACADSCE     1
 Name: f_27, Length: 735092, dtype: int64]

In [86]:
# 3. split data #

#train_test_split approach does not work when I use TE.

test_size = 0.1
train.reset_index(inplace=True, drop=True)
test_index = random.sample(list(train.index), int(test_size*train.shape[0]))
train_ = train.iloc[list(set(train.index)-set(test_index))]
test = train.iloc[test_index]
display(train.shape, test.shape, train.head(3), test.head(3))

train0, test0 = train.copy(), test.copy()

(891020, 33)

(89102, 33)

Unnamed: 0,id,f_00,f_01,f_02,f_03,f_04,f_05,f_06,f_07,f_08,f_09,f_10,f_11,f_12,f_13,f_14,f_15,f_16,f_17,f_18,f_19,f_20,f_21,f_22,f_23,f_24,f_25,f_26,f_27,f_28,f_29,f_30,target
0,0,-1.373246,0.238887,-0.243376,0.567405,-0.647715,0.839326,0.113133,1,5,1,3,3,3,1,6,1,0,7,4,0.298218,-0.919717,3.058541,-2.540739,0.766952,-2.730628,-0.208177,1.363402,ABABDADBAB,67.609153,0,0,0
1,1,1.697021,-1.710322,-2.230332,-0.545661,1.113173,-1.552175,0.447825,1,3,4,0,2,3,0,1,0,4,6,0,-3.147667,-1.075434,2.17905,2.278315,-0.633658,-1.217077,-3.782194,-0.058316,ACACCADCEB,377.096415,0,0,1
2,2,1.681726,0.616746,-1.027689,0.810492,-0.609086,0.113965,-0.70866,1,0,2,6,6,4,3,1,2,2,1,4,2.820733,-3.485342,-0.784235,-1.385775,-0.520558,-0.009121,2.788536,-3.703488,AAAEABCKAD,-195.599702,0,2,1


Unnamed: 0,id,f_00,f_01,f_02,f_03,f_04,f_05,f_06,f_07,f_08,f_09,f_10,f_11,f_12,f_13,f_14,f_15,f_16,f_17,f_18,f_19,f_20,f_21,f_22,f_23,f_24,f_25,f_26,f_27,f_28,f_29,f_30,target
815368,823563,0.126825,-0.620277,-0.131036,-0.35646,-2.102414,0.951481,-0.453106,10,1,2,1,1,6,4,1,3,4,2,3,1.412335,4.969106,1.580168,-2.143728,0.835757,-3.194409,0.338456,-5.802652,BDBBAABDDC,-122.75284,0,1,0
264653,267299,0.197548,-1.232319,0.33711,-0.294354,0.17481,0.690658,-0.197091,5,2,1,2,3,2,4,1,3,2,1,1,-0.032126,-3.166331,1.637605,-2.176246,1.429549,-1.865197,1.341617,-3.158002,BABBAABTBD,99.577633,0,2,0
841839,850323,1.741605,0.026155,-0.44354,-1.405589,-1.379177,-1.184869,0.066502,2,1,1,4,4,3,2,0,0,3,2,3,-0.866229,0.838712,0.767533,-2.75464,6.514849,0.133619,2.648703,2.929487,AAAECBAPBC,-111.43888,0,0,1


In [87]:
# 5. FE #

# first do TE

train, test = train0.copy(), test0.copy()
# create noise target in pred set to be discarded later. needed to avoid error due to dimension mismatch.
pred['target'] = 1

display(train.head(), test.head())
_, pred = TargetEncoderMP(train, pred, ['f_27'], 'target')
train, test = TargetEncoderMP(train0, test, ['f_27'], 'target')
display(train.head(), test.head(), pred.head())

# then extract a target

X_train = train.copy()
y_train = X_train.pop('target')
X_test = test.copy()
y_test = X_test.pop('target')

# then do OHE with columntransformer

ohe_cols = ['f_29', 'f_30']
feature_transformer = ColumnTransformer([
   ("cat", OneHotEncoder(sparse = False, handle_unknown="ignore"), ohe_cols)],
   remainder="passthrough")
print('Number of features before transaformation: ', X_train.shape)
X_train = pd.DataFrame(feature_transformer.fit_transform(X_train), columns=feature_transformer.get_feature_names_out())
X_test = pd.DataFrame(feature_transformer.transform(X_test), columns=feature_transformer.get_feature_names_out())
pred = pd.DataFrame(feature_transformer.transform(pred), columns=feature_transformer.get_feature_names_out())
display(X_train.head(), X_test.head(), pred.head())


Unnamed: 0,id,f_00,f_01,f_02,f_03,f_04,f_05,f_06,f_07,f_08,f_09,f_10,f_11,f_12,f_13,f_14,f_15,f_16,f_17,f_18,f_19,f_20,f_21,f_22,f_23,f_24,f_25,f_26,f_27,f_28,f_29,f_30,target
0,0,-1.373246,0.238887,-0.243376,0.567405,-0.647715,0.839326,0.113133,1,5,1,3,3,3,1,6,1,0,7,4,0.298218,-0.919717,3.058541,-2.540739,0.766952,-2.730628,-0.208177,1.363402,ABABDADBAB,67.609153,0,0,0
1,1,1.697021,-1.710322,-2.230332,-0.545661,1.113173,-1.552175,0.447825,1,3,4,0,2,3,0,1,0,4,6,0,-3.147667,-1.075434,2.17905,2.278315,-0.633658,-1.217077,-3.782194,-0.058316,ACACCADCEB,377.096415,0,0,1
2,2,1.681726,0.616746,-1.027689,0.810492,-0.609086,0.113965,-0.70866,1,0,2,6,6,4,3,1,2,2,1,4,2.820733,-3.485342,-0.784235,-1.385775,-0.520558,-0.009121,2.788536,-3.703488,AAAEABCKAD,-195.599702,0,2,1
3,3,-0.118172,-0.587835,-0.804638,2.086822,0.371005,-0.128831,-0.282575,3,2,1,0,1,6,4,2,3,3,0,3,1.081084,-2.100177,-2.343819,0.572594,-1.653213,1.686035,-2.533098,-0.608601,BDBBAACBCB,210.826205,0,0,1
4,4,1.148481,-0.176567,-0.664871,-1.101343,0.467875,0.500117,0.407515,3,3,0,4,3,0,6,0,3,3,1,0,-0.126179,0.605033,1.133665,-3.912929,-1.430366,2.127649,-3.306784,4.371371,BDBCBBCHFE,-217.211798,0,1,1


Unnamed: 0,id,f_00,f_01,f_02,f_03,f_04,f_05,f_06,f_07,f_08,f_09,f_10,f_11,f_12,f_13,f_14,f_15,f_16,f_17,f_18,f_19,f_20,f_21,f_22,f_23,f_24,f_25,f_26,f_27,f_28,f_29,f_30,target
815368,823563,0.126825,-0.620277,-0.131036,-0.35646,-2.102414,0.951481,-0.453106,10,1,2,1,1,6,4,1,3,4,2,3,1.412335,4.969106,1.580168,-2.143728,0.835757,-3.194409,0.338456,-5.802652,BDBBAABDDC,-122.75284,0,1,0
264653,267299,0.197548,-1.232319,0.33711,-0.294354,0.17481,0.690658,-0.197091,5,2,1,2,3,2,4,1,3,2,1,1,-0.032126,-3.166331,1.637605,-2.176246,1.429549,-1.865197,1.341617,-3.158002,BABBAABTBD,99.577633,0,2,0
841839,850323,1.741605,0.026155,-0.44354,-1.405589,-1.379177,-1.184869,0.066502,2,1,1,4,4,3,2,0,0,3,2,3,-0.866229,0.838712,0.767533,-2.75464,6.514849,0.133619,2.648703,2.929487,AAAECBAPBC,-111.43888,0,0,1
398155,402154,0.374287,-0.445512,0.757943,-0.161802,0.603074,0.677288,-0.080342,7,5,4,1,1,6,3,3,5,0,2,0,2.480517,-4.545289,-2.489779,-1.19125,0.555626,-3.516975,3.517015,-3.086237,ACBCAABGII,160.50189,0,2,1
808639,816779,-0.75933,0.99566,-0.295559,0.722999,-0.721251,-0.931049,-0.657053,3,2,1,1,3,3,0,0,0,7,0,1,-3.195918,3.425717,-0.756569,1.69406,3.812279,-1.706587,0.648553,-1.194115,AEACDAEFAB,-22.927664,1,0,0


Unnamed: 0,id,f_00,f_01,f_02,f_03,f_04,f_05,f_06,f_07,f_08,f_09,f_10,f_11,f_12,f_13,f_14,f_15,f_16,f_17,f_18,f_19,f_20,f_21,f_22,f_23,f_24,f_25,f_26,f_28,f_29,f_30,target,f_27_encoded
0,0,-1.373246,0.238887,-0.243376,0.567405,-0.647715,0.839326,0.113133,1,5,1,3,3,3,1,6,1,0,7,4,0.298218,-0.919717,3.058541,-2.540739,0.766952,-2.730628,-0.208177,1.363402,67.609153,0,0,0,0.405443
1,1,1.697021,-1.710322,-2.230332,-0.545661,1.113173,-1.552175,0.447825,1,3,4,0,2,3,0,1,0,4,6,0,-3.147667,-1.075434,2.17905,2.278315,-0.633658,-1.217077,-3.782194,-0.058316,377.096415,0,0,1,0.486532
2,2,1.681726,0.616746,-1.027689,0.810492,-0.609086,0.113965,-0.70866,1,0,2,6,6,4,3,1,2,2,1,4,2.820733,-3.485342,-0.784235,-1.385775,-0.520558,-0.009121,2.788536,-3.703488,-195.599702,0,2,1,0.486532
3,3,-0.118172,-0.587835,-0.804638,2.086822,0.371005,-0.128831,-0.282575,3,2,1,0,1,6,4,2,3,3,0,3,1.081084,-2.100177,-2.343819,0.572594,-1.653213,1.686035,-2.533098,-0.608601,210.826205,0,0,1,0.57211
4,4,1.148481,-0.176567,-0.664871,-1.101343,0.467875,0.500117,0.407515,3,3,0,4,3,0,6,0,3,3,1,0,-0.126179,0.605033,1.133665,-3.912929,-1.430366,2.127649,-3.306784,4.371371,-217.211798,0,1,1,0.486532


Unnamed: 0,id,f_00,f_01,f_02,f_03,f_04,f_05,f_06,f_07,f_08,f_09,f_10,f_11,f_12,f_13,f_14,f_15,f_16,f_17,f_18,f_19,f_20,f_21,f_22,f_23,f_24,f_25,f_26,f_28,f_29,f_30,target,f_27_encoded
815368,823563,0.126825,-0.620277,-0.131036,-0.35646,-2.102414,0.951481,-0.453106,10,1,2,1,1,6,4,1,3,4,2,3,1.412335,4.969106,1.580168,-2.143728,0.835757,-3.194409,0.338456,-5.802652,-122.75284,0,1,0,0.376416
264653,267299,0.197548,-1.232319,0.33711,-0.294354,0.17481,0.690658,-0.197091,5,2,1,2,3,2,4,1,3,2,1,1,-0.032126,-3.166331,1.637605,-2.176246,1.429549,-1.865197,1.341617,-3.158002,99.577633,0,2,0,0.425634
841839,850323,1.741605,0.026155,-0.44354,-1.405589,-1.379177,-1.184869,0.066502,2,1,1,4,4,3,2,0,0,3,2,3,-0.866229,0.838712,0.767533,-2.75464,6.514849,0.133619,2.648703,2.929487,-111.43888,0,0,1,0.550638
398155,402154,0.374287,-0.445512,0.757943,-0.161802,0.603074,0.677288,-0.080342,7,5,4,1,1,6,3,3,5,0,2,0,2.480517,-4.545289,-2.489779,-1.19125,0.555626,-3.516975,3.517015,-3.086237,160.50189,0,2,1,0.550634
808639,816779,-0.75933,0.99566,-0.295559,0.722999,-0.721251,-0.931049,-0.657053,3,2,1,1,3,3,0,0,0,7,0,1,-3.195918,3.425717,-0.756569,1.69406,3.812279,-1.706587,0.648553,-1.194115,-22.927664,1,0,0,0.425638


Unnamed: 0,id,f_00,f_01,f_02,f_03,f_04,f_05,f_06,f_07,f_08,f_09,f_10,f_11,f_12,f_13,f_14,f_15,f_16,f_17,f_18,f_19,f_20,f_21,f_22,f_23,f_24,f_25,f_26,f_28,f_29,f_30,target,f_27_encoded
0,900000,0.442517,0.17438,-0.999816,0.762741,0.186778,-1.074775,0.501888,6,6,0,0,0,5,1,1,2,2,0,1,-1.128371,2.749347,-1.0064,-1.193879,-2.435736,-2.42743,-1.966887,5.734205,99.478419,0,0,1,0.486435
1,900001,-0.605598,-0.305715,0.627667,-0.578898,-1.750931,1.35555,-0.190911,1,3,4,0,2,1,3,0,0,0,2,0,-4.424098,1.080762,2.382405,0.149442,1.883322,-2.848714,-0.725155,3.194219,-65.993825,1,0,1,0.486435
2,900002,0.30399,2.44511,0.246515,0.818248,0.359731,-1.331845,1.358622,3,3,4,1,1,3,2,2,4,1,0,3,-1.523864,-1.406712,-7.026098,1.312277,-5.157192,1.714005,0.585032,0.066898,-87.405622,0,1,1,0.486435
3,900003,0.154053,0.260126,-1.367092,-0.093175,-1.111034,-0.948481,1.11922,0,0,4,2,1,5,0,3,3,1,4,2,-1.404597,3.011085,-0.594532,-3.939475,1.75457,-2.364007,-1.00332,3.893099,-281.29346,0,0,1,0.376416
4,900004,-1.651904,-0.424266,-0.667356,-0.322124,-0.089462,0.181705,1.784983,2,2,2,0,0,3,0,1,2,0,2,2,-1.968516,0.100594,0.084906,-0.985736,-0.130467,-3.557893,1.210687,1.861884,25.629415,0,2,1,0.486435


Number of features before transaformation:  (891020, 32)


Unnamed: 0,cat__f_29_0,cat__f_29_1,cat__f_30_0,cat__f_30_1,cat__f_30_2,remainder__id,remainder__f_00,remainder__f_01,remainder__f_02,remainder__f_03,remainder__f_04,remainder__f_05,remainder__f_06,remainder__f_07,remainder__f_08,remainder__f_09,remainder__f_10,remainder__f_11,remainder__f_12,remainder__f_13,remainder__f_14,remainder__f_15,remainder__f_16,remainder__f_17,remainder__f_18,remainder__f_19,remainder__f_20,remainder__f_21,remainder__f_22,remainder__f_23,remainder__f_24,remainder__f_25,remainder__f_26,remainder__f_28,remainder__f_27_encoded
0,1.0,0.0,1.0,0.0,0.0,0.0,-1.373246,0.238887,-0.243376,0.567405,-0.647715,0.839326,0.113133,1.0,5.0,1.0,3.0,3.0,3.0,1.0,6.0,1.0,0.0,7.0,4.0,0.298218,-0.919717,3.058541,-2.540739,0.766952,-2.730628,-0.208177,1.363402,67.609153,0.405443
1,1.0,0.0,1.0,0.0,0.0,1.0,1.697021,-1.710322,-2.230332,-0.545661,1.113173,-1.552175,0.447825,1.0,3.0,4.0,0.0,2.0,3.0,0.0,1.0,0.0,4.0,6.0,0.0,-3.147667,-1.075434,2.17905,2.278315,-0.633658,-1.217077,-3.782194,-0.058316,377.096415,0.486532
2,1.0,0.0,0.0,0.0,1.0,2.0,1.681726,0.616746,-1.027689,0.810492,-0.609086,0.113965,-0.70866,1.0,0.0,2.0,6.0,6.0,4.0,3.0,1.0,2.0,2.0,1.0,4.0,2.820733,-3.485342,-0.784235,-1.385775,-0.520558,-0.009121,2.788536,-3.703488,-195.599702,0.486532
3,1.0,0.0,1.0,0.0,0.0,3.0,-0.118172,-0.587835,-0.804638,2.086822,0.371005,-0.128831,-0.282575,3.0,2.0,1.0,0.0,1.0,6.0,4.0,2.0,3.0,3.0,0.0,3.0,1.081084,-2.100177,-2.343819,0.572594,-1.653213,1.686035,-2.533098,-0.608601,210.826205,0.57211
4,1.0,0.0,0.0,1.0,0.0,4.0,1.148481,-0.176567,-0.664871,-1.101343,0.467875,0.500117,0.407515,3.0,3.0,0.0,4.0,3.0,0.0,6.0,0.0,3.0,3.0,1.0,0.0,-0.126179,0.605033,1.133665,-3.912929,-1.430366,2.127649,-3.306784,4.371371,-217.211798,0.486532


Unnamed: 0,cat__f_29_0,cat__f_29_1,cat__f_30_0,cat__f_30_1,cat__f_30_2,remainder__id,remainder__f_00,remainder__f_01,remainder__f_02,remainder__f_03,remainder__f_04,remainder__f_05,remainder__f_06,remainder__f_07,remainder__f_08,remainder__f_09,remainder__f_10,remainder__f_11,remainder__f_12,remainder__f_13,remainder__f_14,remainder__f_15,remainder__f_16,remainder__f_17,remainder__f_18,remainder__f_19,remainder__f_20,remainder__f_21,remainder__f_22,remainder__f_23,remainder__f_24,remainder__f_25,remainder__f_26,remainder__f_28,remainder__f_27_encoded
0,1.0,0.0,0.0,1.0,0.0,823563.0,0.126825,-0.620277,-0.131036,-0.35646,-2.102414,0.951481,-0.453106,10.0,1.0,2.0,1.0,1.0,6.0,4.0,1.0,3.0,4.0,2.0,3.0,1.412335,4.969106,1.580168,-2.143728,0.835757,-3.194409,0.338456,-5.802652,-122.75284,0.376416
1,1.0,0.0,0.0,0.0,1.0,267299.0,0.197548,-1.232319,0.33711,-0.294354,0.17481,0.690658,-0.197091,5.0,2.0,1.0,2.0,3.0,2.0,4.0,1.0,3.0,2.0,1.0,1.0,-0.032126,-3.166331,1.637605,-2.176246,1.429549,-1.865197,1.341617,-3.158002,99.577633,0.425634
2,1.0,0.0,1.0,0.0,0.0,850323.0,1.741605,0.026155,-0.44354,-1.405589,-1.379177,-1.184869,0.066502,2.0,1.0,1.0,4.0,4.0,3.0,2.0,0.0,0.0,3.0,2.0,3.0,-0.866229,0.838712,0.767533,-2.75464,6.514849,0.133619,2.648703,2.929487,-111.43888,0.550638
3,1.0,0.0,0.0,0.0,1.0,402154.0,0.374287,-0.445512,0.757943,-0.161802,0.603074,0.677288,-0.080342,7.0,5.0,4.0,1.0,1.0,6.0,3.0,3.0,5.0,0.0,2.0,0.0,2.480517,-4.545289,-2.489779,-1.19125,0.555626,-3.516975,3.517015,-3.086237,160.50189,0.550634
4,0.0,1.0,1.0,0.0,0.0,816779.0,-0.75933,0.99566,-0.295559,0.722999,-0.721251,-0.931049,-0.657053,3.0,2.0,1.0,1.0,3.0,3.0,0.0,0.0,0.0,7.0,0.0,1.0,-3.195918,3.425717,-0.756569,1.69406,3.812279,-1.706587,0.648553,-1.194115,-22.927664,0.425638


Unnamed: 0,cat__f_29_0,cat__f_29_1,cat__f_30_0,cat__f_30_1,cat__f_30_2,remainder__id,remainder__f_00,remainder__f_01,remainder__f_02,remainder__f_03,remainder__f_04,remainder__f_05,remainder__f_06,remainder__f_07,remainder__f_08,remainder__f_09,remainder__f_10,remainder__f_11,remainder__f_12,remainder__f_13,remainder__f_14,remainder__f_15,remainder__f_16,remainder__f_17,remainder__f_18,remainder__f_19,remainder__f_20,remainder__f_21,remainder__f_22,remainder__f_23,remainder__f_24,remainder__f_25,remainder__f_26,remainder__f_28,remainder__f_27_encoded
0,1.0,0.0,1.0,0.0,0.0,900000.0,0.442517,0.17438,-0.999816,0.762741,0.186778,-1.074775,0.501888,6.0,6.0,0.0,0.0,0.0,5.0,1.0,1.0,2.0,2.0,0.0,1.0,-1.128371,2.749347,-1.0064,-1.193879,-2.435736,-2.42743,-1.966887,5.734205,99.478419,0.486435
1,0.0,1.0,1.0,0.0,0.0,900001.0,-0.605598,-0.305715,0.627667,-0.578898,-1.750931,1.35555,-0.190911,1.0,3.0,4.0,0.0,2.0,1.0,3.0,0.0,0.0,0.0,2.0,0.0,-4.424098,1.080762,2.382405,0.149442,1.883322,-2.848714,-0.725155,3.194219,-65.993825,0.486435
2,1.0,0.0,0.0,1.0,0.0,900002.0,0.30399,2.44511,0.246515,0.818248,0.359731,-1.331845,1.358622,3.0,3.0,4.0,1.0,1.0,3.0,2.0,2.0,4.0,1.0,0.0,3.0,-1.523864,-1.406712,-7.026098,1.312277,-5.157192,1.714005,0.585032,0.066898,-87.405622,0.486435
3,1.0,0.0,1.0,0.0,0.0,900003.0,0.154053,0.260126,-1.367092,-0.093175,-1.111034,-0.948481,1.11922,0.0,0.0,4.0,2.0,1.0,5.0,0.0,3.0,3.0,1.0,4.0,2.0,-1.404597,3.011085,-0.594532,-3.939475,1.75457,-2.364007,-1.00332,3.893099,-281.29346,0.376416
4,1.0,0.0,0.0,0.0,1.0,900004.0,-1.651904,-0.424266,-0.667356,-0.322124,-0.089462,0.181705,1.784983,2.0,2.0,2.0,0.0,0.0,3.0,0.0,1.0,2.0,0.0,2.0,2.0,-1.968516,0.100594,0.084906,-0.985736,-0.130467,-3.557893,1.210687,1.861884,25.629415,0.486435


In [88]:
# 6. fit XGBoost #

time1 = time.time()
xgb = XGBClassifier(n_estimators=100, max_depth=6, eta=0.1, tree_method = 'gpu_hist')
xgb.fit(X_train, y_train)
display(time.time()-time1)


display('Accuracy: ', accuracy_score(y_train,xgb.predict(X_train)))
display('F1 score: ', f1_score(y_train,xgb.predict(X_train)))
display('Recall score: ', recall_score(y_train,xgb.predict(X_train)))
display('Precision score: ', precision_score(y_train,xgb.predict(X_train)))
# Performance evaluation:
display('Accuracy: ', accuracy_score(y_test,xgb.predict(X_test)))
display('F1 score: ', f1_score(y_test,xgb.predict(X_test)))
display('Recall score: ', recall_score(y_test,xgb.predict(X_test)))
display('Precision score: ', precision_score(y_test,xgb.predict(X_test)))


455.0181303024292

'Accuracy: '

0.8239433458283765

'F1 score: '

0.8168228271429638

'Recall score: '

0.8069645588720488

'Precision score: '

0.8269249410702169

'Accuracy: '

0.8988574891697156

'F1 score: '

0.8935104220825258

'Recall score: '

0.8699493787390704

'Precision score: '

0.9183832102603964

In [89]:
# optuna hyperparameter optimization

time1 = time.time()

def objective(trial, n_splits=2, n_jobs=-1, scale_pos_weight=1, early_stopping_rounds=50):

    cv_regularizer=0.0
    # Usually values between 0.1 and 0.2 work fine.

    params = {
        "tree_method": 'gpu_hist',
        "verbosity": 0,  # 0 (silent) - 3 (debug)
        "n_estimators": trial.suggest_int("n_estimators", 100, 700),
        "max_depth": trial.suggest_int("max_depth", 4, 10),
        "learning_rate": trial.suggest_uniform("learning_rate", 0.02, 0.3),
        "colsample_bytree": trial.suggest_uniform("colsample_bytree", 0.1, 0.95),
        "subsample": trial.suggest_uniform("subsample", 0.5, 0.95),
        "alpha": trial.suggest_loguniform("alpha", 0.1, 10.0),
        "lambda": trial.suggest_loguniform("lambda", 0.1, 100.0),
        "gamma": trial.suggest_loguniform("gamma", 1e-10, 10.0),
        "min_child_weight": trial.suggest_loguniform("min_child_weight", 0.1, 10),
        "n_jobs": n_jobs,
    }

    X = X_train
    y = y_train

    model = XGBClassifier(**params)
    rkf = KFold(n_splits=n_splits, shuffle=True)
    X_values = X.values
    y_values = y.values
    y_pred = np.zeros_like(y_values)
    y_pred_train = np.zeros_like(y_values)
    for train_index, test_index in rkf.split(X_values):
        X_A, X_B = X_values[train_index, :], X_values[test_index, :]
        y_A, y_B = y_values[train_index], y_values[test_index]
        model.fit(X_A, y_A, eval_set=[(X_B, y_B)],
                  early_stopping_rounds=early_stopping_rounds, verbose = False)
        y_pred[test_index] += model.predict(X_B)
        y_pred_train[train_index] += model.predict(X_A)
    score_train = roc_auc_score(y_train, y_pred_train)
    score_test = roc_auc_score(y_train, y_pred) 
    overfit = score_train-score_test
    #return (f1_score_test)
    return (score_test-cv_regularizer*overfit)


study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=40)
print('Total time for hypermarameter optimization ', time.time()-time1)
hp = study.best_params
for key, value in hp.items():
    print(f"{key:>20s} : {value}")
print(f"{'best objective value':>20s} : {study.best_value}")

optuna_hyperpars = study.best_params
optuna_hyperpars['tree_method']='gpu_hist'
optuna_hyperpars['scale_pos_weight']=1
#optuna_hyperpars['early_stopping_rounds']=50

optuna_xgb = XGBClassifier(**optuna_hyperpars)
optuna_xgb.fit(X_train, y_train)


[32m[I 2022-07-21 21:41:50,214][0m A new study created in memory with name: no-name-d7c75441-4142-4116-9492-0444182d286c[0m
[32m[I 2022-07-21 21:42:11,142][0m Trial 0 finished with value: 0.8501819867062275 and parameters: {'n_estimators': 598, 'max_depth': 4, 'learning_rate': 0.2515295002738546, 'colsample_bytree': 0.5337705465491908, 'subsample': 0.5969365492287091, 'alpha': 7.841696173708269, 'lambda': 4.712361125504181, 'gamma': 0.0019484379495450263, 'min_child_weight': 0.5936175090499339}. Best is trial 0 with value: 0.8501819867062275.[0m
[32m[I 2022-07-21 21:42:28,864][0m Trial 1 finished with value: 0.8207199374290801 and parameters: {'n_estimators': 115, 'max_depth': 9, 'learning_rate': 0.12297192008950213, 'colsample_bytree': 0.3292517387147944, 'subsample': 0.5196509612619842, 'alpha': 0.5290583711811292, 'lambda': 1.309167162253788, 'gamma': 0.8683234780553785, 'min_child_weight': 1.6318811132622744}. Best is trial 0 with value: 0.8501819867062275.[0m
[32m[I 2022

Total time for hypermarameter optimization  1982.8798310756683
        n_estimators : 436
           max_depth : 10
       learning_rate : 0.11929576160331953
    colsample_bytree : 0.8688101742644472
           subsample : 0.8087924883606143
               alpha : 2.6649248334986426
              lambda : 0.15036226855883963
               gamma : 5.466129489891683e-09
    min_child_weight : 0.14163762290609413
best objective value : 0.8740766907320368


XGBClassifier(alpha=2.6649248334986426, base_score=0.5, booster='gbtree',
              callbacks=None, colsample_bylevel=1, colsample_bynode=1,
              colsample_bytree=0.8688101742644472, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None,
              gamma=5.466129489891683e-09, gpu_id=0, grow_policy='depthwise',
              importance_type=None, interaction_constraints='',
              lambda=0.15036226855883963, learning_rate=0.11929576160331953,
              max_bin=256, max_cat_to_onehot=4, max_delta_step=0, max_depth=10,
              max_leaves=0, min_child_weight=0.14163762290609413, missing=nan,
              monotone_constraints='()', n_estimators=436, n_jobs=0,
              num_parallel_tree=1, predictor='auto', random_state=0, ...)

In [90]:
display('Accuracy: ', accuracy_score(y_train,optuna_xgb.predict(X_train)))
display('F1 score: ', f1_score(y_train,optuna_xgb.predict(X_train)))
display('Recall score: ', recall_score(y_train,optuna_xgb.predict(X_train)))
display('Precision score: ', precision_score(y_train,optuna_xgb.predict(X_train)))
display('ROCAUC score: ', roc_auc_score(y_train,optuna_xgb.predict(X_train)))
# Performance evaluation:
display('Accuracy: ', accuracy_score(y_test,optuna_xgb.predict(X_test)))
display('F1 score: ', f1_score(y_test,optuna_xgb.predict(X_test)))
display('Recall score: ', recall_score(y_test,optuna_xgb.predict(X_test)))
display('Precision score: ', precision_score(y_test,optuna_xgb.predict(X_test)))
display('ROCAUC score: ', roc_auc_score(y_test,optuna_xgb.predict(X_test)))

'Accuracy: '

0.933468384548046

'F1 score: '

0.9312531818640418

'Recall score: '

0.9263790800211341

'Precision score: '

0.9361788447279696

'ROCAUC score: '

0.9332811271494686

'Accuracy: '

0.951751924760387

'F1 score: '

0.9495759878954221

'Recall score: '

0.931408191440405

'Precision score: '

0.9684666363614614

'ROCAUC score: '

0.9512656399119559

In [91]:
display(X_test.head(), pred.head())

Unnamed: 0,cat__f_29_0,cat__f_29_1,cat__f_30_0,cat__f_30_1,cat__f_30_2,remainder__id,remainder__f_00,remainder__f_01,remainder__f_02,remainder__f_03,remainder__f_04,remainder__f_05,remainder__f_06,remainder__f_07,remainder__f_08,remainder__f_09,remainder__f_10,remainder__f_11,remainder__f_12,remainder__f_13,remainder__f_14,remainder__f_15,remainder__f_16,remainder__f_17,remainder__f_18,remainder__f_19,remainder__f_20,remainder__f_21,remainder__f_22,remainder__f_23,remainder__f_24,remainder__f_25,remainder__f_26,remainder__f_28,remainder__f_27_encoded
0,1.0,0.0,0.0,1.0,0.0,823563.0,0.126825,-0.620277,-0.131036,-0.35646,-2.102414,0.951481,-0.453106,10.0,1.0,2.0,1.0,1.0,6.0,4.0,1.0,3.0,4.0,2.0,3.0,1.412335,4.969106,1.580168,-2.143728,0.835757,-3.194409,0.338456,-5.802652,-122.75284,0.376416
1,1.0,0.0,0.0,0.0,1.0,267299.0,0.197548,-1.232319,0.33711,-0.294354,0.17481,0.690658,-0.197091,5.0,2.0,1.0,2.0,3.0,2.0,4.0,1.0,3.0,2.0,1.0,1.0,-0.032126,-3.166331,1.637605,-2.176246,1.429549,-1.865197,1.341617,-3.158002,99.577633,0.425634
2,1.0,0.0,1.0,0.0,0.0,850323.0,1.741605,0.026155,-0.44354,-1.405589,-1.379177,-1.184869,0.066502,2.0,1.0,1.0,4.0,4.0,3.0,2.0,0.0,0.0,3.0,2.0,3.0,-0.866229,0.838712,0.767533,-2.75464,6.514849,0.133619,2.648703,2.929487,-111.43888,0.550638
3,1.0,0.0,0.0,0.0,1.0,402154.0,0.374287,-0.445512,0.757943,-0.161802,0.603074,0.677288,-0.080342,7.0,5.0,4.0,1.0,1.0,6.0,3.0,3.0,5.0,0.0,2.0,0.0,2.480517,-4.545289,-2.489779,-1.19125,0.555626,-3.516975,3.517015,-3.086237,160.50189,0.550634
4,0.0,1.0,1.0,0.0,0.0,816779.0,-0.75933,0.99566,-0.295559,0.722999,-0.721251,-0.931049,-0.657053,3.0,2.0,1.0,1.0,3.0,3.0,0.0,0.0,0.0,7.0,0.0,1.0,-3.195918,3.425717,-0.756569,1.69406,3.812279,-1.706587,0.648553,-1.194115,-22.927664,0.425638


Unnamed: 0,cat__f_29_0,cat__f_29_1,cat__f_30_0,cat__f_30_1,cat__f_30_2,remainder__id,remainder__f_00,remainder__f_01,remainder__f_02,remainder__f_03,remainder__f_04,remainder__f_05,remainder__f_06,remainder__f_07,remainder__f_08,remainder__f_09,remainder__f_10,remainder__f_11,remainder__f_12,remainder__f_13,remainder__f_14,remainder__f_15,remainder__f_16,remainder__f_17,remainder__f_18,remainder__f_19,remainder__f_20,remainder__f_21,remainder__f_22,remainder__f_23,remainder__f_24,remainder__f_25,remainder__f_26,remainder__f_28,remainder__f_27_encoded
0,1.0,0.0,1.0,0.0,0.0,900000.0,0.442517,0.17438,-0.999816,0.762741,0.186778,-1.074775,0.501888,6.0,6.0,0.0,0.0,0.0,5.0,1.0,1.0,2.0,2.0,0.0,1.0,-1.128371,2.749347,-1.0064,-1.193879,-2.435736,-2.42743,-1.966887,5.734205,99.478419,0.486435
1,0.0,1.0,1.0,0.0,0.0,900001.0,-0.605598,-0.305715,0.627667,-0.578898,-1.750931,1.35555,-0.190911,1.0,3.0,4.0,0.0,2.0,1.0,3.0,0.0,0.0,0.0,2.0,0.0,-4.424098,1.080762,2.382405,0.149442,1.883322,-2.848714,-0.725155,3.194219,-65.993825,0.486435
2,1.0,0.0,0.0,1.0,0.0,900002.0,0.30399,2.44511,0.246515,0.818248,0.359731,-1.331845,1.358622,3.0,3.0,4.0,1.0,1.0,3.0,2.0,2.0,4.0,1.0,0.0,3.0,-1.523864,-1.406712,-7.026098,1.312277,-5.157192,1.714005,0.585032,0.066898,-87.405622,0.486435
3,1.0,0.0,1.0,0.0,0.0,900003.0,0.154053,0.260126,-1.367092,-0.093175,-1.111034,-0.948481,1.11922,0.0,0.0,4.0,2.0,1.0,5.0,0.0,3.0,3.0,1.0,4.0,2.0,-1.404597,3.011085,-0.594532,-3.939475,1.75457,-2.364007,-1.00332,3.893099,-281.29346,0.376416
4,1.0,0.0,0.0,0.0,1.0,900004.0,-1.651904,-0.424266,-0.667356,-0.322124,-0.089462,0.181705,1.784983,2.0,2.0,2.0,0.0,0.0,3.0,0.0,1.0,2.0,0.0,2.0,2.0,-1.968516,0.100594,0.084906,-0.985736,-0.130467,-3.557893,1.210687,1.861884,25.629415,0.486435


In [92]:
# 7. Do FI analysis 

In [100]:
# 8. Generate predictions

submission_df_bt = pd.DataFrame({'id': pred0.id, 'target': (optuna_xgb.predict_proba(pred))[:,1]}, columns=['id', 'target'])
submission_df_bt.to_csv('KP20_PGS0522.csv',index=False)

os.chdir(r'/kaggle/working')

from IPython.display import FileLink
FileLink(r'KP20_PGS0522.csv')

In [101]:
submission_df_bt

Unnamed: 0,id,target
0,900000,0.853487
1,900001,0.957309
2,900002,0.001397
3,900003,0.029376
4,900004,0.714077
...,...,...
699995,1599995,0.292754
699996,1599996,0.905898
699997,1599997,0.399087
699998,1599998,0.117940


In [99]:
optuna_xgb.predict_proba(pred)

array([[1.4651316e-01, 8.5348684e-01],
       [4.2690516e-02, 9.5730948e-01],
       [9.9860334e-01, 1.3966311e-03],
       ...,
       [6.0091317e-01, 3.9908686e-01],
       [8.8205975e-01, 1.1794024e-01],
       [9.9925679e-01, 7.4319844e-04]], dtype=float32)