Outline:
1. Load evth.
2. Preclean data.
3. Train-test split.
4. Missing values.
5. FE.
6. Modeling.
7. FI.
8. predictions.

In [11]:
import pandas as pd
import numpy as np
import seaborn as sns
import os, time, warnings, gzip, gc, random, math, shap, pickle, optuna
from IPython.display import display
from matplotlib_venn import venn2, venn2_circles, venn2_unweighted
from matplotlib_venn import venn3, venn3_circles
from matplotlib import pyplot as plt
%matplotlib inline

from sklearn.preprocessing import LabelBinarizer, LabelEncoder, OrdinalEncoder, OneHotEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, train_test_split, KFold
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, precision_recall_curve, auc
from sklearn.metrics import mean_squared_error, mean_absolute_error, roc_auc_score
from sklearn.inspection import permutation_importance
from category_encoders import MEstimateEncoder
from xgboost import XGBClassifier

pd.set_option('display.max_columns', 5000)
pd.set_option('display.max_rows',200)

warnings.filterwarnings("ignore")

# target encoding code:

### target encoding ###
# source: https://www.kaggle.com/code/ryanholbrook/feature-engineering-for-house-prices/notebook

class CrossFoldEncoder:
    def __init__(self, encoder, **kwargs):
        self.encoder_ = encoder
        self.kwargs_ = kwargs  # keyword arguments for the encoder
        self.cv_ = KFold(n_splits=4)

    # Fit an encoder on one split and transform the feature on the
    # other. Iterating over the splits in all folds gives a complete
    # transformation. We also now have one trained encoder on each
    # fold.
    def fit_transform(self, X, y, cols):
        self.fitted_encoders_ = []
        self.cols_ = cols
        X_encoded = []
        for idx_encode, idx_train in self.cv_.split(X):
            fitted_encoder = self.encoder_(cols=cols, **self.kwargs_)
            fitted_encoder.fit(
                X.iloc[idx_encode, :], y.iloc[idx_encode],
            )
            X_encoded.append(fitted_encoder.transform(X.iloc[idx_train, :])[cols])
            self.fitted_encoders_.append(fitted_encoder)
        X_encoded = pd.concat(X_encoded)
        X_encoded.columns = [name + "_encoded" for name in X_encoded.columns]
        return X_encoded

    # To transform the test data, average the encodings learned from
    # each fold.
    def transform(self, X):
        from functools import reduce

        X_encoded_list = []
        for fitted_encoder in self.fitted_encoders_:
            X_encoded = fitted_encoder.transform(X)
            X_encoded_list.append(X_encoded[self.cols_])
        X_encoded = reduce(
            lambda x, y: x.add(y, fill_value=0), X_encoded_list
        ) / len(X_encoded_list)
        X_encoded.columns = [name + "_encoded" for name in X_encoded.columns]
        return X_encoded


ImportError: cannot import name 'auc_precision_recall' from 'sklearn.metrics' (/opt/conda/lib/python3.7/site-packages/sklearn/metrics/__init__.py)

In [6]:
def TargetEncoderMP(train_set, test_set, feature_cols, target_col, M=5):
    
    """ This function implements terget encoding on train-test split with 2-fold CV
    It relies on previously defined CrossFoldEncoder, stolen from somehwere on SO.
    m parameter controls smoothing and is defaulted at 5 """
    
    encoder = CrossFoldEncoder(MEstimateEncoder, m=M)
    train_encoded = encoder.fit_transform(train_set, train_set[target_col], cols=feature_cols)
    test_encoded = encoder.transform(test_set)

    train_set.drop(columns=feature_cols, inplace=True)
    test_set.drop(columns=feature_cols, inplace=True)
    train_set = pd.concat([train_set, train_encoded], axis = 1)
    test_set = pd.concat([test_set, test_encoded], axis = 1)
    
    return ([train_set, test_set])
    

In [7]:
# 1. Load data #

train = pd.read_csv('../input/tabular-playground-series-may-2022/train.csv',
                   skiprows=lambda i: i>0 and random.random() > 0.1)
display(train.shape, train.head())

display(train.target.value_counts())
num_cols = [col for col in train.columns if train[col].nunique()>10]
num_cols.remove('f_27')
cat_cols = list(set(train.columns) - set(num_cols) - set(['target']))
print('num_cols: ', num_cols, '\n', 'cat_cols: ', cat_cols)
display(train[num_cols].describe())
cat_cols = list(set(train.columns) - set(num_cols) - set(['target']))
display([train[col].value_counts() for col in cat_cols])

#[train[col].value_counts() for col in train.columns if train[col].nunique()<10]
#display(train.count())

(89836, 33)

Unnamed: 0,id,f_00,f_01,f_02,f_03,f_04,f_05,f_06,f_07,f_08,f_09,f_10,f_11,f_12,f_13,f_14,f_15,f_16,f_17,f_18,f_19,f_20,f_21,f_22,f_23,f_24,f_25,f_26,f_27,f_28,f_29,f_30,target
0,1,1.697021,-1.710322,-2.230332,-0.545661,1.113173,-1.552175,0.447825,1,3,4,0,2,3,0,1,0,4,6,0,-3.147667,-1.075434,2.17905,2.278315,-0.633658,-1.217077,-3.782194,-0.058316,ACACCADCEB,377.096415,0,0,1
1,30,-0.239164,0.34637,-1.405502,0.252331,0.82904,0.680469,-0.053596,1,2,1,2,2,5,4,3,5,1,1,1,-1.174678,3.179138,-4.384276,0.813502,-3.635928,2.720842,2.873769,-2.943167,ACBCBBARAE,7.473229,0,2,0
2,31,-1.860566,-0.860656,0.711783,-1.88437,0.505121,1.08628,-0.901086,3,1,3,3,1,4,2,1,0,1,1,0,2.400315,1.709712,-3.370536,1.041986,-2.739856,2.177456,-5.309015,-2.365721,BDBBCBBGDC,1.313627,0,1,0
3,40,-0.950887,0.809457,0.411326,0.983356,0.529987,0.939055,-0.876306,2,1,2,3,2,4,2,1,3,3,3,0,1.173702,0.198167,-2.003606,5.910205,-2.872623,3.565474,-2.493328,-2.106116,ADABABCHDA,105.18057,0,2,1
4,47,-0.25368,-0.259053,0.151535,-0.406073,1.618217,-2.659185,0.638532,0,4,0,0,1,3,0,4,1,2,0,0,4.817508,3.367342,2.252013,0.272049,5.519552,-3.13389,0.884206,-0.919681,ACAAAACNCD,84.792093,0,2,1


0    46217
1    43619
Name: target, dtype: int64

num_cols:  ['id', 'f_00', 'f_01', 'f_02', 'f_03', 'f_04', 'f_05', 'f_06', 'f_07', 'f_08', 'f_09', 'f_10', 'f_11', 'f_12', 'f_13', 'f_14', 'f_15', 'f_16', 'f_17', 'f_18', 'f_19', 'f_20', 'f_21', 'f_22', 'f_23', 'f_24', 'f_25', 'f_26', 'f_28'] 
 cat_cols:  ['f_30', 'f_29', 'f_27']


Unnamed: 0,id,f_00,f_01,f_02,f_03,f_04,f_05,f_06,f_07,f_08,f_09,f_10,f_11,f_12,f_13,f_14,f_15,f_16,f_17,f_18,f_19,f_20,f_21,f_22,f_23,f_24,f_25,f_26,f_28
count,89836.0,89836.0,89836.0,89836.0,89836.0,89836.0,89836.0,89836.0,89836.0,89836.0,89836.0,89836.0,89836.0,89836.0,89836.0,89836.0,89836.0,89836.0,89836.0,89836.0,89836.0,89836.0,89836.0,89836.0,89836.0,89836.0,89836.0,89836.0,89836.0
mean,449915.009929,-0.004467,0.00311,0.002252,-0.00142,-0.004297,-0.001576,-0.000661,2.030968,2.064362,2.362438,2.182722,1.802518,2.839151,2.245358,1.513224,2.092891,2.091311,1.86098,2.061056,0.308727,-0.185691,-0.158789,-0.013109,-0.368639,-0.341766,0.181015,0.362236,-0.823942
std,259050.766902,0.996714,0.999251,1.002254,0.998652,1.003038,1.003039,1.004229,1.653087,1.59105,1.641407,1.647961,1.536609,1.761236,1.541637,1.361229,1.564195,1.556484,1.462577,1.561546,2.314823,2.406471,2.480631,2.452499,2.447073,2.389505,2.42433,2.479164,239.168629
min,1.0,-4.337005,-4.617312,-4.351819,-4.222918,-4.215718,-4.372231,-4.269762,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-10.622076,-11.146797,-11.369032,-10.26708,-11.353846,-10.205817,-10.608082,-12.333948,-995.606338
25%,226624.5,-0.680377,-0.671123,-0.671809,-0.680197,-0.680597,-0.678398,-0.677078,1.0,1.0,1.0,1.0,1.0,2.0,1.0,0.0,1.0,1.0,1.0,1.0,-1.24588,-1.818989,-1.80837,-1.648489,-2.011356,-1.951943,-1.450739,-1.265294,-159.30944
50%,450073.0,-0.005111,0.004047,0.003978,-0.002033,-0.009987,-0.005057,-0.005707,2.0,2.0,2.0,2.0,2.0,3.0,2.0,1.0,2.0,2.0,2.0,2.0,0.332961,-0.188765,-0.157444,0.023956,-0.38887,-0.332086,0.161058,0.409462,-0.582054
75%,673533.25,0.670522,0.680139,0.68199,0.674281,0.672201,0.676434,0.674695,3.0,3.0,3.0,3.0,3.0,4.0,3.0,2.0,3.0,3.0,3.0,3.0,1.881243,1.448625,1.501501,1.657337,1.249316,1.272343,1.816165,2.036149,158.409682
max,899999.0,4.186919,4.405091,3.981683,4.277234,4.948983,4.363519,4.338643,12.0,16.0,12.0,12.0,12.0,13.0,12.0,10.0,12.0,11.0,11.0,13.0,11.034011,9.649709,10.543227,11.34408,12.07022,10.500941,10.7717,10.535765,1092.295038


[2    30043
 0    30014
 1    29779
 Name: f_30, dtype: int64,
 0    58705
 1    31131
 Name: f_29, dtype: int64,
 BBBBBABLBD    4
 ACBAAABQCB    4
 BAAABABPBB    4
 ABBCABBACC    3
 BCBBBBCJDB    3
              ..
 AEBDABCACE    1
 BABBCBCIBD    1
 AEBBBADRBB    1
 BBAAAABHCC    1
 BCAACADSCE    1
 Name: f_27, Length: 87746, dtype: int64]

In [8]:
# 3. split data #

#train_test_split approach does not work when I use TE.

test_size = 0.1
train.reset_index(inplace=True, drop=True)
test_index = random.sample(list(train.index), int(test_size*train.shape[0]))
train_ = train.iloc[list(set(train.index)-set(test_index))]
test = train.iloc[test_index]
display(train.shape, test.shape, train.head(3), test.head(3))

train0, test0 = train.copy(), test.copy()

(89836, 33)

(8983, 33)

Unnamed: 0,id,f_00,f_01,f_02,f_03,f_04,f_05,f_06,f_07,f_08,f_09,f_10,f_11,f_12,f_13,f_14,f_15,f_16,f_17,f_18,f_19,f_20,f_21,f_22,f_23,f_24,f_25,f_26,f_27,f_28,f_29,f_30,target
0,1,1.697021,-1.710322,-2.230332,-0.545661,1.113173,-1.552175,0.447825,1,3,4,0,2,3,0,1,0,4,6,0,-3.147667,-1.075434,2.17905,2.278315,-0.633658,-1.217077,-3.782194,-0.058316,ACACCADCEB,377.096415,0,0,1
1,30,-0.239164,0.34637,-1.405502,0.252331,0.82904,0.680469,-0.053596,1,2,1,2,2,5,4,3,5,1,1,1,-1.174678,3.179138,-4.384276,0.813502,-3.635928,2.720842,2.873769,-2.943167,ACBCBBARAE,7.473229,0,2,0
2,31,-1.860566,-0.860656,0.711783,-1.88437,0.505121,1.08628,-0.901086,3,1,3,3,1,4,2,1,0,1,1,0,2.400315,1.709712,-3.370536,1.041986,-2.739856,2.177456,-5.309015,-2.365721,BDBBCBBGDC,1.313627,0,1,0


Unnamed: 0,id,f_00,f_01,f_02,f_03,f_04,f_05,f_06,f_07,f_08,f_09,f_10,f_11,f_12,f_13,f_14,f_15,f_16,f_17,f_18,f_19,f_20,f_21,f_22,f_23,f_24,f_25,f_26,f_27,f_28,f_29,f_30,target
88804,889644,-1.894842,0.612225,-0.878064,-0.944102,0.589081,-1.079761,0.989459,0,1,4,1,4,1,3,3,3,0,1,4,1.208303,-2.165976,0.668678,-2.085581,-2.070507,-2.434997,2.818719,2.988354,ABBCCAAHCC,-364.327749,0,0,0
83888,839855,1.172646,0.749445,0.303391,1.569207,0.432175,0.181678,-0.824584,1,1,2,0,1,2,2,0,2,1,1,4,-1.471634,-1.892546,-1.991294,1.453444,1.136771,-2.466239,-0.018656,3.096894,BDBBCBFDFA,126.210582,1,2,1
88759,889302,1.361345,-0.296016,2.013206,0.300235,-0.071533,-1.23148,-0.279857,0,0,4,1,4,3,1,0,2,4,3,1,0.892503,1.285205,-3.054236,0.781007,-2.615113,0.675957,-1.247549,0.914018,AEBDCBBHEE,5.789978,0,1,0


In [9]:
# 5. FE #

# first do TE

train, test = train0.copy(), test0.copy()

display(train.head(), test.head())
train, test = TargetEncoderMP(train, test, ['f_27'], 'target')
display(train.head(), test.head())

# then extract a target

X_train = train.copy()
y_train = X_train.pop('target')
X_test = test.copy()
y_test = X_test.pop('target')

# then do OHE with columntransformer

ohe_cols = ['f_29', 'f_30']
feature_transformer = ColumnTransformer([
   ("cat", OneHotEncoder(sparse = False, handle_unknown="ignore"), ohe_cols)],
   remainder="passthrough")
print('Number of features before transaformation: ', X_train.shape)
X_train = pd.DataFrame(feature_transformer.fit_transform(X_train), columns=feature_transformer.get_feature_names_out())
X_test = pd.DataFrame(feature_transformer.transform(X_test), columns=feature_transformer.get_feature_names_out())
display(X_train.head(), X_test.head())


Unnamed: 0,id,f_00,f_01,f_02,f_03,f_04,f_05,f_06,f_07,f_08,f_09,f_10,f_11,f_12,f_13,f_14,f_15,f_16,f_17,f_18,f_19,f_20,f_21,f_22,f_23,f_24,f_25,f_26,f_27,f_28,f_29,f_30,target
0,1,1.697021,-1.710322,-2.230332,-0.545661,1.113173,-1.552175,0.447825,1,3,4,0,2,3,0,1,0,4,6,0,-3.147667,-1.075434,2.17905,2.278315,-0.633658,-1.217077,-3.782194,-0.058316,ACACCADCEB,377.096415,0,0,1
1,30,-0.239164,0.34637,-1.405502,0.252331,0.82904,0.680469,-0.053596,1,2,1,2,2,5,4,3,5,1,1,1,-1.174678,3.179138,-4.384276,0.813502,-3.635928,2.720842,2.873769,-2.943167,ACBCBBARAE,7.473229,0,2,0
2,31,-1.860566,-0.860656,0.711783,-1.88437,0.505121,1.08628,-0.901086,3,1,3,3,1,4,2,1,0,1,1,0,2.400315,1.709712,-3.370536,1.041986,-2.739856,2.177456,-5.309015,-2.365721,BDBBCBBGDC,1.313627,0,1,0
3,40,-0.950887,0.809457,0.411326,0.983356,0.529987,0.939055,-0.876306,2,1,2,3,2,4,2,1,3,3,3,0,1.173702,0.198167,-2.003606,5.910205,-2.872623,3.565474,-2.493328,-2.106116,ADABABCHDA,105.18057,0,2,1
4,47,-0.25368,-0.259053,0.151535,-0.406073,1.618217,-2.659185,0.638532,0,4,0,0,1,3,0,4,1,2,0,0,4.817508,3.367342,2.252013,0.272049,5.519552,-3.13389,0.884206,-0.919681,ACAAAACNCD,84.792093,0,2,1


Unnamed: 0,id,f_00,f_01,f_02,f_03,f_04,f_05,f_06,f_07,f_08,f_09,f_10,f_11,f_12,f_13,f_14,f_15,f_16,f_17,f_18,f_19,f_20,f_21,f_22,f_23,f_24,f_25,f_26,f_27,f_28,f_29,f_30,target
88804,889644,-1.894842,0.612225,-0.878064,-0.944102,0.589081,-1.079761,0.989459,0,1,4,1,4,1,3,3,3,0,1,4,1.208303,-2.165976,0.668678,-2.085581,-2.070507,-2.434997,2.818719,2.988354,ABBCCAAHCC,-364.327749,0,0,0
83888,839855,1.172646,0.749445,0.303391,1.569207,0.432175,0.181678,-0.824584,1,1,2,0,1,2,2,0,2,1,1,4,-1.471634,-1.892546,-1.991294,1.453444,1.136771,-2.466239,-0.018656,3.096894,BDBBCBFDFA,126.210582,1,2,1
88759,889302,1.361345,-0.296016,2.013206,0.300235,-0.071533,-1.23148,-0.279857,0,0,4,1,4,3,1,0,2,4,3,1,0.892503,1.285205,-3.054236,0.781007,-2.615113,0.675957,-1.247549,0.914018,AEBDCBBHEE,5.789978,0,1,0
53969,539645,0.136315,0.305526,0.478734,0.004733,-0.016104,1.659186,-0.089498,0,1,2,3,2,2,5,2,3,4,1,0,-1.856273,3.942629,0.741843,1.422442,2.506919,-2.723231,-0.031818,3.986723,ADAEABCFDE,323.465472,0,1,0
17999,181711,-1.497529,-0.323693,-0.664493,0.530076,-1.363216,-1.295596,0.933052,5,3,1,1,0,9,2,0,0,2,0,7,2.749472,-1.837278,2.739889,-2.813472,-2.063277,-5.243097,1.47156,1.302824,BBAABADIAC,-177.00317,0,2,0


Unnamed: 0,id,f_00,f_01,f_02,f_03,f_04,f_05,f_06,f_07,f_08,f_09,f_10,f_11,f_12,f_13,f_14,f_15,f_16,f_17,f_18,f_19,f_20,f_21,f_22,f_23,f_24,f_25,f_26,f_28,f_29,f_30,target,f_27_encoded
0,1,1.697021,-1.710322,-2.230332,-0.545661,1.113173,-1.552175,0.447825,1,3,4,0,2,3,0,1,0,4,6,0,-3.147667,-1.075434,2.17905,2.278315,-0.633658,-1.217077,-3.782194,-0.058316,377.096415,0,0,1,0.485863
1,30,-0.239164,0.34637,-1.405502,0.252331,0.82904,0.680469,-0.053596,1,2,1,2,2,5,4,3,5,1,1,1,-1.174678,3.179138,-4.384276,0.813502,-3.635928,2.720842,2.873769,-2.943167,7.473229,0,2,0,0.485863
2,31,-1.860566,-0.860656,0.711783,-1.88437,0.505121,1.08628,-0.901086,3,1,3,3,1,4,2,1,0,1,1,0,2.400315,1.709712,-3.370536,1.041986,-2.739856,2.177456,-5.309015,-2.365721,1.313627,0,1,0,0.485863
3,40,-0.950887,0.809457,0.411326,0.983356,0.529987,0.939055,-0.876306,2,1,2,3,2,4,2,1,3,3,3,0,1.173702,0.198167,-2.003606,5.910205,-2.872623,3.565474,-2.493328,-2.106116,105.18057,0,2,1,0.485863
4,47,-0.25368,-0.259053,0.151535,-0.406073,1.618217,-2.659185,0.638532,0,4,0,0,1,3,0,4,1,2,0,0,4.817508,3.367342,2.252013,0.272049,5.519552,-3.13389,0.884206,-0.919681,84.792093,0,2,1,0.485863


Unnamed: 0,id,f_00,f_01,f_02,f_03,f_04,f_05,f_06,f_07,f_08,f_09,f_10,f_11,f_12,f_13,f_14,f_15,f_16,f_17,f_18,f_19,f_20,f_21,f_22,f_23,f_24,f_25,f_26,f_28,f_29,f_30,target,f_27_encoded
88804,889644,-1.894842,0.612225,-0.878064,-0.944102,0.589081,-1.079761,0.989459,0,1,4,1,4,1,3,3,3,0,1,4,1.208303,-2.165976,0.668678,-2.085581,-2.070507,-2.434997,2.818719,2.988354,-364.327749,0,0,0,0.424838
83888,839855,1.172646,0.749445,0.303391,1.569207,0.432175,0.181678,-0.824584,1,1,2,0,1,2,2,0,2,1,1,4,-1.471634,-1.892546,-1.991294,1.453444,1.136771,-2.466239,-0.018656,3.096894,126.210582,1,2,1,0.549838
88759,889302,1.361345,-0.296016,2.013206,0.300235,-0.071533,-1.23148,-0.279857,0,0,4,1,4,3,1,0,2,4,3,1,0.892503,1.285205,-3.054236,0.781007,-2.615113,0.675957,-1.247549,0.914018,5.789978,0,1,0,0.424838
53969,539645,0.136315,0.305526,0.478734,0.004733,-0.016104,1.659186,-0.089498,0,1,2,3,2,2,5,2,3,4,1,0,-1.856273,3.942629,0.741843,1.422442,2.506919,-2.723231,-0.031818,3.986723,323.465472,0,1,0,0.424862
17999,181711,-1.497529,-0.323693,-0.664493,0.530076,-1.363216,-1.295596,0.933052,5,3,1,1,0,9,2,0,0,2,0,7,2.749472,-1.837278,2.739889,-2.813472,-2.063277,-5.243097,1.47156,1.302824,-177.00317,0,2,0,0.424861


Number of features before transaformation:  (89836, 32)


Unnamed: 0,cat__f_29_0,cat__f_29_1,cat__f_30_0,cat__f_30_1,cat__f_30_2,remainder__id,remainder__f_00,remainder__f_01,remainder__f_02,remainder__f_03,remainder__f_04,remainder__f_05,remainder__f_06,remainder__f_07,remainder__f_08,remainder__f_09,remainder__f_10,remainder__f_11,remainder__f_12,remainder__f_13,remainder__f_14,remainder__f_15,remainder__f_16,remainder__f_17,remainder__f_18,remainder__f_19,remainder__f_20,remainder__f_21,remainder__f_22,remainder__f_23,remainder__f_24,remainder__f_25,remainder__f_26,remainder__f_28,remainder__f_27_encoded
0,1.0,0.0,1.0,0.0,0.0,1.0,1.697021,-1.710322,-2.230332,-0.545661,1.113173,-1.552175,0.447825,1.0,3.0,4.0,0.0,2.0,3.0,0.0,1.0,0.0,4.0,6.0,0.0,-3.147667,-1.075434,2.17905,2.278315,-0.633658,-1.217077,-3.782194,-0.058316,377.096415,0.485863
1,1.0,0.0,0.0,0.0,1.0,30.0,-0.239164,0.34637,-1.405502,0.252331,0.82904,0.680469,-0.053596,1.0,2.0,1.0,2.0,2.0,5.0,4.0,3.0,5.0,1.0,1.0,1.0,-1.174678,3.179138,-4.384276,0.813502,-3.635928,2.720842,2.873769,-2.943167,7.473229,0.485863
2,1.0,0.0,0.0,1.0,0.0,31.0,-1.860566,-0.860656,0.711783,-1.88437,0.505121,1.08628,-0.901086,3.0,1.0,3.0,3.0,1.0,4.0,2.0,1.0,0.0,1.0,1.0,0.0,2.400315,1.709712,-3.370536,1.041986,-2.739856,2.177456,-5.309015,-2.365721,1.313627,0.485863
3,1.0,0.0,0.0,0.0,1.0,40.0,-0.950887,0.809457,0.411326,0.983356,0.529987,0.939055,-0.876306,2.0,1.0,2.0,3.0,2.0,4.0,2.0,1.0,3.0,3.0,3.0,0.0,1.173702,0.198167,-2.003606,5.910205,-2.872623,3.565474,-2.493328,-2.106116,105.18057,0.485863
4,1.0,0.0,0.0,0.0,1.0,47.0,-0.25368,-0.259053,0.151535,-0.406073,1.618217,-2.659185,0.638532,0.0,4.0,0.0,0.0,1.0,3.0,0.0,4.0,1.0,2.0,0.0,0.0,4.817508,3.367342,2.252013,0.272049,5.519552,-3.13389,0.884206,-0.919681,84.792093,0.485863


Unnamed: 0,cat__f_29_0,cat__f_29_1,cat__f_30_0,cat__f_30_1,cat__f_30_2,remainder__id,remainder__f_00,remainder__f_01,remainder__f_02,remainder__f_03,remainder__f_04,remainder__f_05,remainder__f_06,remainder__f_07,remainder__f_08,remainder__f_09,remainder__f_10,remainder__f_11,remainder__f_12,remainder__f_13,remainder__f_14,remainder__f_15,remainder__f_16,remainder__f_17,remainder__f_18,remainder__f_19,remainder__f_20,remainder__f_21,remainder__f_22,remainder__f_23,remainder__f_24,remainder__f_25,remainder__f_26,remainder__f_28,remainder__f_27_encoded
0,1.0,0.0,1.0,0.0,0.0,889644.0,-1.894842,0.612225,-0.878064,-0.944102,0.589081,-1.079761,0.989459,0.0,1.0,4.0,1.0,4.0,1.0,3.0,3.0,3.0,0.0,1.0,4.0,1.208303,-2.165976,0.668678,-2.085581,-2.070507,-2.434997,2.818719,2.988354,-364.327749,0.424838
1,0.0,1.0,0.0,0.0,1.0,839855.0,1.172646,0.749445,0.303391,1.569207,0.432175,0.181678,-0.824584,1.0,1.0,2.0,0.0,1.0,2.0,2.0,0.0,2.0,1.0,1.0,4.0,-1.471634,-1.892546,-1.991294,1.453444,1.136771,-2.466239,-0.018656,3.096894,126.210582,0.549838
2,1.0,0.0,0.0,1.0,0.0,889302.0,1.361345,-0.296016,2.013206,0.300235,-0.071533,-1.23148,-0.279857,0.0,0.0,4.0,1.0,4.0,3.0,1.0,0.0,2.0,4.0,3.0,1.0,0.892503,1.285205,-3.054236,0.781007,-2.615113,0.675957,-1.247549,0.914018,5.789978,0.424838
3,1.0,0.0,0.0,1.0,0.0,539645.0,0.136315,0.305526,0.478734,0.004733,-0.016104,1.659186,-0.089498,0.0,1.0,2.0,3.0,2.0,2.0,5.0,2.0,3.0,4.0,1.0,0.0,-1.856273,3.942629,0.741843,1.422442,2.506919,-2.723231,-0.031818,3.986723,323.465472,0.424862
4,1.0,0.0,0.0,0.0,1.0,181711.0,-1.497529,-0.323693,-0.664493,0.530076,-1.363216,-1.295596,0.933052,5.0,3.0,1.0,1.0,0.0,9.0,2.0,0.0,0.0,2.0,0.0,7.0,2.749472,-1.837278,2.739889,-2.813472,-2.063277,-5.243097,1.47156,1.302824,-177.00317,0.424861


In [12]:
# 6. fit XGBoost #

time1 = time.time()
xgb = XGBClassifier(n_estimators=100, max_depth=6, eta=0.1)
xgb.fit(X_train, y_train)
display(time.time()-time1)


display('Accuracy: ', accuracy_score(y_train,xgb.predict(X_train)))
display('F1 score: ', f1_score(y_train,xgb.predict(X_train)))
display('Recall score: ', recall_score(y_train,xgb.predict(X_train)))
display('Precision score: ', precision_score(y_train,xgb.predict(X_train)))
# Performance evaluation:
display('Accuracy: ', accuracy_score(y_test,xgb.predict(X_test)))
display('F1 score: ', f1_score(y_test,xgb.predict(X_test)))
display('Recall score: ', recall_score(y_test,xgb.predict(X_test)))
display('Precision score: ', precision_score(y_test,xgb.predict(X_test)))


40.69318914413452

'Accuracy: '

0.8385725099069415

'F1 score: '

0.8314935743998513

'Recall score: '

0.820284738302116

'Precision score: '

0.8430129821171924

'Accuracy: '

0.8677501948124234

'F1 score: '

0.8654586636466591

'Recall score: '

0.8761751891767943

'Precision score: '

0.8550011188185276

In [None]:
# optuna hyperparameter optimization

def objective(trial, n_splits=2, n_jobs=-1, scale_pos_weight=1, early_stopping_rounds=50):

    cv_regularizer=0.0
    # Usually values between 0.1 and 0.2 work fine.

    params = {
        "tree_method": 'gpu_hist',
        "verbosity": 0,  # 0 (silent) - 3 (debug)
        "n_estimators": trial.suggest_int("n_estimators", 100, 700),
        "max_depth": trial.suggest_int("max_depth", 4, 10),
        "learning_rate": trial.suggest_uniform("learning_rate", 0.02, 0.3),
        "colsample_bytree": trial.suggest_uniform("colsample_bytree", 0.1, 0.95),
        "subsample": trial.suggest_uniform("subsample", 0.5, 0.95),
        "alpha": trial.suggest_loguniform("alpha", 0.1, 10.0),
        "lambda": trial.suggest_loguniform("lambda", 0.1, 100.0),
        "gamma": trial.suggest_loguniform("gamma", 1e-10, 10.0),
        "min_child_weight": trial.suggest_loguniform("min_child_weight", 0.1, 10),
        "n_jobs": n_jobs,
    }

    X = X_train
    y = y_train

    model = XGBClassifier(**params)
    rkf = KFold(n_splits=n_splits, shuffle=True)
    X_values = X.values
    y_values = y.values
    y_pred = np.zeros_like(y_values)
    y_pred_train = np.zeros_like(y_values)
    for train_index, test_index in rkf.split(X_values):
        X_A, X_B = X_values[train_index, :], X_values[test_index, :]
        y_A, y_B = y_values[train_index], y_values[test_index]
        model.fit(X_A, y_A, eval_set=[(X_B, y_B)],
                  early_stopping_rounds=early_stopping_rounds, verbose = False)
        y_pred[test_index] += model.predict(X_B)
        y_pred_train[train_index] += model.predict(X_A)
    score_train = f1_score(y_train, y_pred_train)
    score_test = f1_score(y_train, y_pred) 
    overfit = score_train-score_test
    #return (f1_score_test)
    return (score_test-cv_regularizer*overfit)


study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=40)
print('Total time for hypermarameter optimization ', time.time()-time1)
hp = study.best_params
for key, value in hp.items():
    print(f"{key:>20s} : {value}")
print(f"{'best objective value':>20s} : {study.best_value}")

optuna_hyperpars = study.best_params
optuna_hyperpars['tree_method']='gpu_hist'
optuna_hyperpars['scale_pos_weight']=1
optuna_hyperpars['early_stopping_rounds']=50

optuna_xgb = XGBClassifier(**optuna_hyperpars)
optuna_xgb.fit(X_train, y_train)


[32m[I 2022-07-21 19:48:43,467][0m A new study created in memory with name: no-name-2f4a45fa-840e-4f7f-afb3-f013ec4e1e89[0m
[32m[I 2022-07-21 19:48:51,744][0m Trial 0 finished with value: 0.7902703591193354 and parameters: {'n_estimators': 599, 'max_depth': 6, 'learning_rate': 0.07390402394447462, 'colsample_bytree': 0.18220547456300884, 'subsample': 0.6512349455111186, 'alpha': 4.459326219142105, 'lambda': 2.671444943577002, 'gamma': 3.767326071352863e-10, 'min_child_weight': 2.0081514807307808}. Best is trial 0 with value: 0.7902703591193354.[0m
[32m[I 2022-07-21 19:49:04,301][0m Trial 1 finished with value: 0.8215635146623904 and parameters: {'n_estimators': 277, 'max_depth': 10, 'learning_rate': 0.13553023270800407, 'colsample_bytree': 0.8460899199265409, 'subsample': 0.7792979085941842, 'alpha': 1.081147265336026, 'lambda': 3.4551611285862864, 'gamma': 0.44365066241530016, 'min_child_weight': 3.8161067582276216}. Best is trial 1 with value: 0.8215635146623904.[0m
[32m[I 

In [None]:
# 7. Do FI analysisany