Outline:
1. Load evth.
2. Preclean data.
3. Train-test split.
4. Missing values.
5. FE.
6. Modeling.
7. FI.
8. predictions.

In [17]:
import pandas as pd
import numpy as np
import seaborn as sns
import os, time, warnings, gzip, gc, random, math, shap, pickle, optuna
from IPython.display import display
from matplotlib_venn import venn2, venn2_circles, venn2_unweighted
from matplotlib_venn import venn3, venn3_circles
from matplotlib import pyplot as plt
%matplotlib inline

from sklearn.preprocessing import LabelBinarizer, LabelEncoder, OrdinalEncoder, OneHotEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, train_test_split, KFold
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, precision_recall_curve, auc
from sklearn.metrics import mean_squared_error, mean_absolute_error, roc_auc_score
from sklearn.inspection import permutation_importance
from category_encoders import MEstimateEncoder
from xgboost import XGBClassifier

pd.set_option('display.max_columns', 5000)
pd.set_option('display.max_rows',200)

warnings.filterwarnings("ignore")

# target encoding code:

### target encoding ###
# source: https://www.kaggle.com/code/ryanholbrook/feature-engineering-for-house-prices/notebook

class CrossFoldEncoder:
    def __init__(self, encoder, **kwargs):
        self.encoder_ = encoder
        self.kwargs_ = kwargs  # keyword arguments for the encoder
        self.cv_ = KFold(n_splits=4)

    # Fit an encoder on one split and transform the feature on the
    # other. Iterating over the splits in all folds gives a complete
    # transformation. We also now have one trained encoder on each
    # fold.
    def fit_transform(self, X, y, cols):
        self.fitted_encoders_ = []
        self.cols_ = cols
        X_encoded = []
        for idx_encode, idx_train in self.cv_.split(X):
            fitted_encoder = self.encoder_(cols=cols, **self.kwargs_)
            fitted_encoder.fit(
                X.iloc[idx_encode, :], y.iloc[idx_encode],
            )
            X_encoded.append(fitted_encoder.transform(X.iloc[idx_train, :])[cols])
            self.fitted_encoders_.append(fitted_encoder)
        X_encoded = pd.concat(X_encoded)
        X_encoded.columns = [name + "_encoded" for name in X_encoded.columns]
        return X_encoded

    # To transform the test data, average the encodings learned from
    # each fold.
    def transform(self, X):
        from functools import reduce

        X_encoded_list = []
        for fitted_encoder in self.fitted_encoders_:
            X_encoded = fitted_encoder.transform(X)
            X_encoded_list.append(X_encoded[self.cols_])
        X_encoded = reduce(
            lambda x, y: x.add(y, fill_value=0), X_encoded_list
        ) / len(X_encoded_list)
        X_encoded.columns = [name + "_encoded" for name in X_encoded.columns]
        return X_encoded


In [18]:
def TargetEncoderMP(train_set, test_set, feature_cols, target_col, M=5):
    
    """ This function implements terget encoding on train-test split with 2-fold CV
    It relies on previously defined CrossFoldEncoder, stolen from somehwere on SO.
    m parameter controls smoothing and is defaulted at 5 """
    
    encoder = CrossFoldEncoder(MEstimateEncoder, m=M)
    train_encoded = encoder.fit_transform(train_set, train_set[target_col], cols=feature_cols)
    test_encoded = encoder.transform(test_set)

    train_set.drop(columns=feature_cols, inplace=True)
    test_set.drop(columns=feature_cols, inplace=True)
    train_set = pd.concat([train_set, train_encoded], axis = 1)
    test_set = pd.concat([test_set, test_encoded], axis = 1)
    
    return ([train_set, test_set])
    

In [19]:
# 1. Load data #

train = pd.read_csv('../input/tabular-playground-series-may-2022/train.csv',
                   skiprows=lambda i: i>0 and random.random() > 0.1)
pred = pd.read_csv('../input/tabular-playground-series-may-2022/test.csv')
display(train.shape, train.head())

display(train.target.value_counts())
num_cols = [col for col in train.columns if train[col].nunique()>10]
num_cols.remove('f_27')
cat_cols = list(set(train.columns) - set(num_cols) - set(['target']))
print('num_cols: ', num_cols, '\n', 'cat_cols: ', cat_cols)
display(train[num_cols].describe())
cat_cols = list(set(train.columns) - set(num_cols) - set(['target']))
display([train[col].value_counts() for col in cat_cols])

#[train[col].value_counts() for col in train.columns if train[col].nunique()<10]
#display(train.count())

(90540, 33)

Unnamed: 0,id,f_00,f_01,f_02,f_03,f_04,f_05,f_06,f_07,f_08,f_09,f_10,f_11,f_12,f_13,f_14,f_15,f_16,f_17,f_18,f_19,f_20,f_21,f_22,f_23,f_24,f_25,f_26,f_27,f_28,f_29,f_30,target
0,8,0.866221,0.842351,-1.367634,1.237506,0.682962,0.845666,0.173852,3,2,1,7,4,3,5,2,3,0,0,4,-2.56893,-0.796303,-0.786741,-1.241121,-1.245038,1.696402,0.354545,-0.541267,ACAEBADDAA,-52.223857,0,0,0
1,18,0.922494,-0.828627,0.953987,0.835863,0.112589,2.432248,0.236815,0,3,3,1,2,1,2,5,1,1,3,2,0.594356,-0.983587,-0.789391,4.188384,-2.615815,-2.015672,7.466349,6.045183,AGBCBACCBA,448.896335,0,2,0
2,21,-0.645481,1.060355,-1.213576,-1.319743,-0.032761,0.868289,0.082763,2,3,4,1,2,3,2,0,9,1,2,3,-3.408796,3.370601,0.445238,-0.305235,-4.579724,-0.982572,1.438014,-1.00872,BBADDAECDC,143.101347,1,1,1
3,27,-1.814636,-1.578055,-0.164912,1.02449,0.810403,1.94986,1.704441,1,1,3,4,1,3,5,1,0,1,3,4,1.247929,-3.367163,0.97469,-0.255284,0.086702,-2.260849,0.98156,-3.27038,ACABCBCRCC,36.856304,1,1,0
4,39,0.916766,-0.468947,-0.255398,-0.569036,0.193245,0.77535,-0.274236,2,2,6,0,1,7,3,1,1,7,3,5,0.593269,-1.350143,-4.503929,-0.129318,-1.132656,0.588172,-3.338356,0.478027,BCBDAABGCF,-262.819456,1,1,0


0    46675
1    43865
Name: target, dtype: int64

num_cols:  ['id', 'f_00', 'f_01', 'f_02', 'f_03', 'f_04', 'f_05', 'f_06', 'f_07', 'f_08', 'f_09', 'f_10', 'f_11', 'f_12', 'f_13', 'f_14', 'f_15', 'f_16', 'f_17', 'f_18', 'f_19', 'f_20', 'f_21', 'f_22', 'f_23', 'f_24', 'f_25', 'f_26', 'f_28'] 
 cat_cols:  ['f_30', 'f_29', 'f_27']


Unnamed: 0,id,f_00,f_01,f_02,f_03,f_04,f_05,f_06,f_07,f_08,f_09,f_10,f_11,f_12,f_13,f_14,f_15,f_16,f_17,f_18,f_19,f_20,f_21,f_22,f_23,f_24,f_25,f_26,f_28
count,90540.0,90540.0,90540.0,90540.0,90540.0,90540.0,90540.0,90540.0,90540.0,90540.0,90540.0,90540.0,90540.0,90540.0,90540.0,90540.0,90540.0,90540.0,90540.0,90540.0,90540.0,90540.0,90540.0,90540.0,90540.0,90540.0,90540.0,90540.0,90540.0
mean,449008.769903,-0.001078,0.004748,0.001573,-0.001465,-0.004293,0.00189,-0.004092,2.031047,2.063044,2.362889,2.176463,1.801469,2.848918,2.232483,1.506174,2.093749,2.09979,1.863773,2.070886,0.305905,-0.181463,-0.159229,-0.029894,-0.361611,-0.341586,0.17573,0.35728,-1.195224
std,259707.689482,1.000146,0.998448,1.002224,1.001003,1.002941,1.000911,1.004206,1.655558,1.594261,1.635472,1.648631,1.53641,1.765622,1.535558,1.358467,1.56634,1.562913,1.467218,1.573027,2.316888,2.395452,2.469125,2.452356,2.448831,2.393193,2.421398,2.464871,239.146084
min,8.0,-4.286109,-4.167642,-3.937882,-4.628484,-4.070449,-4.576952,-4.060522,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-10.481084,-9.878739,-12.183785,-10.820862,-11.629187,-10.798768,-10.17382,-11.169623,-1157.166666
25%,225154.0,-0.676722,-0.677484,-0.67671,-0.675423,-0.684657,-0.669395,-0.680961,1.0,1.0,1.0,1.0,1.0,2.0,1.0,0.0,1.0,1.0,1.0,1.0,-1.236377,-1.806292,-1.817116,-1.662761,-2.010249,-1.966418,-1.430129,-1.25545,-160.271535
50%,447951.0,0.001841,0.006407,0.007071,-0.001586,-0.008212,-0.003147,-0.005699,2.0,2.0,2.0,2.0,2.0,3.0,2.0,1.0,2.0,2.0,2.0,2.0,0.333198,-0.185209,-0.157871,0.017744,-0.3922,-0.341565,0.162599,0.417487,-1.798168
75%,673742.0,0.675066,0.680002,0.678414,0.668399,0.674137,0.673176,0.671435,3.0,3.0,3.0,3.0,3.0,4.0,3.0,2.0,3.0,3.0,3.0,3.0,1.866994,1.428255,1.510876,1.631938,1.258636,1.263288,1.794457,2.02553,158.890966
max,899989.0,4.249785,4.013197,4.200609,4.230063,4.104314,4.060916,4.211428,13.0,13.0,13.0,14.0,13.0,14.0,12.0,14.0,12.0,12.0,12.0,13.0,12.079667,9.91448,11.527211,9.850519,10.860577,12.389844,10.273561,12.775398,1018.321466


[0    30514
 2    30278
 1    29748
 Name: f_30, dtype: int64,
 0    59138
 1    31402
 Name: f_29, dtype: int64,
 BCBBBBELCC    4
 BABBABCEBD    4
 BCBCCACIBD    4
 ACBCAACPBB    4
 BCBBBACPBD    4
              ..
 BDBEBABSDE    1
 BAAAAABFCC    1
 BBBBDABCCF    1
 ABBCABETCF    1
 BABACADRAC    1
 Name: f_27, Length: 88447, dtype: int64]

In [20]:
# 3. split data #

#train_test_split approach does not work when I use TE.

test_size = 0.1
train.reset_index(inplace=True, drop=True)
test_index = random.sample(list(train.index), int(test_size*train.shape[0]))
train_ = train.iloc[list(set(train.index)-set(test_index))]
test = train.iloc[test_index]
display(train.shape, test.shape, train.head(3), test.head(3))

train0, test0 = train.copy(), test.copy()

(90540, 33)

(9054, 33)

Unnamed: 0,id,f_00,f_01,f_02,f_03,f_04,f_05,f_06,f_07,f_08,f_09,f_10,f_11,f_12,f_13,f_14,f_15,f_16,f_17,f_18,f_19,f_20,f_21,f_22,f_23,f_24,f_25,f_26,f_27,f_28,f_29,f_30,target
0,8,0.866221,0.842351,-1.367634,1.237506,0.682962,0.845666,0.173852,3,2,1,7,4,3,5,2,3,0,0,4,-2.56893,-0.796303,-0.786741,-1.241121,-1.245038,1.696402,0.354545,-0.541267,ACAEBADDAA,-52.223857,0,0,0
1,18,0.922494,-0.828627,0.953987,0.835863,0.112589,2.432248,0.236815,0,3,3,1,2,1,2,5,1,1,3,2,0.594356,-0.983587,-0.789391,4.188384,-2.615815,-2.015672,7.466349,6.045183,AGBCBACCBA,448.896335,0,2,0
2,21,-0.645481,1.060355,-1.213576,-1.319743,-0.032761,0.868289,0.082763,2,3,4,1,2,3,2,0,9,1,2,3,-3.408796,3.370601,0.445238,-0.305235,-4.579724,-0.982572,1.438014,-1.00872,BBADDAECDC,143.101347,1,1,1


Unnamed: 0,id,f_00,f_01,f_02,f_03,f_04,f_05,f_06,f_07,f_08,f_09,f_10,f_11,f_12,f_13,f_14,f_15,f_16,f_17,f_18,f_19,f_20,f_21,f_22,f_23,f_24,f_25,f_26,f_27,f_28,f_29,f_30,target
29168,289957,-0.148378,0.548169,0.829241,0.802299,1.274497,0.114888,1.61088,0,5,2,0,1,0,1,2,2,3,1,1,-1.383431,-2.436701,0.738691,-0.35718,3.859422,-0.357368,-1.819748,-0.173728,AEACCADOCC,513.106852,0,0,1
3141,30286,-0.257653,-0.239395,-1.265026,-1.11995,-0.521108,0.613049,1.392635,0,2,0,0,3,1,0,1,2,1,3,2,3.095569,1.966354,-1.91391,-4.345637,-5.080232,-4.358149,-3.480443,-5.962773,ABBCAAAPGE,-238.048664,0,0,0
16248,160966,0.578058,0.435574,1.036967,0.393754,1.7951,-0.53876,0.700666,1,3,3,3,1,1,1,2,0,0,4,1,-1.120046,-2.191985,0.597896,-1.264482,-0.066934,-1.922217,0.838191,3.240245,ABBBABCEBB,443.892115,0,0,1


In [21]:
# 5. FE #

# first do TE

train, test = train0.copy(), test0.copy()

display(train.head(), test.head())
_, pred = TargetEncoderMP(train, pred, ['f_27'], 'target')
train, test = TargetEncoderMP(train, test, ['f_27'], 'target')
display(train.head(), test.head(), pred.head())

# then extract a target

X_train = train.copy()
y_train = X_train.pop('target')
X_test = test.copy()
y_test = X_test.pop('target')

# then do OHE with columntransformer

ohe_cols = ['f_29', 'f_30']
feature_transformer = ColumnTransformer([
   ("cat", OneHotEncoder(sparse = False, handle_unknown="ignore"), ohe_cols)],
   remainder="passthrough")
print('Number of features before transaformation: ', X_train.shape)
X_train = pd.DataFrame(feature_transformer.fit_transform(X_train), columns=feature_transformer.get_feature_names_out())
X_test = pd.DataFrame(feature_transformer.transform(X_test), columns=feature_transformer.get_feature_names_out())
pred = pd.DataFrame(feature_transformer.transform(X_test), columns=feature_transformer.get_feature_names_out())
display(X_train.head(), X_test.head(), pred.head())


Unnamed: 0,id,f_00,f_01,f_02,f_03,f_04,f_05,f_06,f_07,f_08,f_09,f_10,f_11,f_12,f_13,f_14,f_15,f_16,f_17,f_18,f_19,f_20,f_21,f_22,f_23,f_24,f_25,f_26,f_27,f_28,f_29,f_30,target
0,8,0.866221,0.842351,-1.367634,1.237506,0.682962,0.845666,0.173852,3,2,1,7,4,3,5,2,3,0,0,4,-2.56893,-0.796303,-0.786741,-1.241121,-1.245038,1.696402,0.354545,-0.541267,ACAEBADDAA,-52.223857,0,0,0
1,18,0.922494,-0.828627,0.953987,0.835863,0.112589,2.432248,0.236815,0,3,3,1,2,1,2,5,1,1,3,2,0.594356,-0.983587,-0.789391,4.188384,-2.615815,-2.015672,7.466349,6.045183,AGBCBACCBA,448.896335,0,2,0
2,21,-0.645481,1.060355,-1.213576,-1.319743,-0.032761,0.868289,0.082763,2,3,4,1,2,3,2,0,9,1,2,3,-3.408796,3.370601,0.445238,-0.305235,-4.579724,-0.982572,1.438014,-1.00872,BBADDAECDC,143.101347,1,1,1
3,27,-1.814636,-1.578055,-0.164912,1.02449,0.810403,1.94986,1.704441,1,1,3,4,1,3,5,1,0,1,3,4,1.247929,-3.367163,0.97469,-0.255284,0.086702,-2.260849,0.98156,-3.27038,ACABCBCRCC,36.856304,1,1,0
4,39,0.916766,-0.468947,-0.255398,-0.569036,0.193245,0.77535,-0.274236,2,2,6,0,1,7,3,1,1,7,3,5,0.593269,-1.350143,-4.503929,-0.129318,-1.132656,0.588172,-3.338356,0.478027,BCBDAABGCF,-262.819456,1,1,0


Unnamed: 0,id,f_00,f_01,f_02,f_03,f_04,f_05,f_06,f_07,f_08,f_09,f_10,f_11,f_12,f_13,f_14,f_15,f_16,f_17,f_18,f_19,f_20,f_21,f_22,f_23,f_24,f_25,f_26,f_27,f_28,f_29,f_30,target
29168,289957,-0.148378,0.548169,0.829241,0.802299,1.274497,0.114888,1.61088,0,5,2,0,1,0,1,2,2,3,1,1,-1.383431,-2.436701,0.738691,-0.35718,3.859422,-0.357368,-1.819748,-0.173728,AEACCADOCC,513.106852,0,0,1
3141,30286,-0.257653,-0.239395,-1.265026,-1.11995,-0.521108,0.613049,1.392635,0,2,0,0,3,1,0,1,2,1,3,2,3.095569,1.966354,-1.91391,-4.345637,-5.080232,-4.358149,-3.480443,-5.962773,ABBCAAAPGE,-238.048664,0,0,0
16248,160966,0.578058,0.435574,1.036967,0.393754,1.7951,-0.53876,0.700666,1,3,3,3,1,1,1,2,0,0,4,1,-1.120046,-2.191985,0.597896,-1.264482,-0.066934,-1.922217,0.838191,3.240245,ABBBABCEBB,443.892115,0,0,1
56194,557102,0.119832,-1.140194,0.259692,0.768091,0.435252,-0.705126,-0.101193,2,4,4,2,1,1,4,1,7,6,3,6,-0.670141,-2.160991,0.812893,0.443669,-0.593305,-1.31924,0.505617,-1.560496,BABBAAETDB,-23.346895,1,1,1
7902,76957,0.186579,0.635994,-1.792145,-0.002432,0.991567,1.208961,0.261658,4,2,3,3,2,4,1,2,2,3,2,0,0.886578,0.710126,2.281544,-4.812871,-1.067746,3.884482,-2.182611,6.669902,BBBDBABDDE,154.033815,1,0,1


Unnamed: 0,id,f_00,f_01,f_02,f_03,f_04,f_05,f_06,f_07,f_08,f_09,f_10,f_11,f_12,f_13,f_14,f_15,f_16,f_17,f_18,f_19,f_20,f_21,f_22,f_23,f_24,f_25,f_26,f_28,f_29,f_30,target,f_27_encoded
0,8,0.866221,0.842351,-1.367634,1.237506,0.682962,0.845666,0.173852,3,2,1,7,4,3,5,2,3,0,0,4,-2.56893,-0.796303,-0.786741,-1.241121,-1.245038,1.696402,0.354545,-0.541267,-52.223857,0,0,0,0.484221
1,18,0.922494,-0.828627,0.953987,0.835863,0.112589,2.432248,0.236815,0,3,3,1,2,1,2,5,1,1,3,2,0.594356,-0.983587,-0.789391,4.188384,-2.615815,-2.015672,7.466349,6.045183,448.896335,0,2,0,0.484221
2,21,-0.645481,1.060355,-1.213576,-1.319743,-0.032761,0.868289,0.082763,2,3,4,1,2,3,2,0,9,1,2,3,-3.408796,3.370601,0.445238,-0.305235,-4.579724,-0.982572,1.438014,-1.00872,143.101347,1,1,1,0.484221
3,27,-1.814636,-1.578055,-0.164912,1.02449,0.810403,1.94986,1.704441,1,1,3,4,1,3,5,1,0,1,3,4,1.247929,-3.367163,0.97469,-0.255284,0.086702,-2.260849,0.98156,-3.27038,36.856304,1,1,0,0.484221
4,39,0.916766,-0.468947,-0.255398,-0.569036,0.193245,0.77535,-0.274236,2,2,6,0,1,7,3,1,1,7,3,5,0.593269,-1.350143,-4.503929,-0.129318,-1.132656,0.588172,-3.338356,0.478027,-262.819456,1,1,0,0.484221


Unnamed: 0,id,f_00,f_01,f_02,f_03,f_04,f_05,f_06,f_07,f_08,f_09,f_10,f_11,f_12,f_13,f_14,f_15,f_16,f_17,f_18,f_19,f_20,f_21,f_22,f_23,f_24,f_25,f_26,f_28,f_29,f_30,target,f_27_encoded
29168,289957,-0.148378,0.548169,0.829241,0.802299,1.274497,0.114888,1.61088,0,5,2,0,1,0,1,2,2,3,1,1,-1.383431,-2.436701,0.738691,-0.35718,3.859422,-0.357368,-1.819748,-0.173728,513.106852,0,0,1,0.548925
3141,30286,-0.257653,-0.239395,-1.265026,-1.11995,-0.521108,0.613049,1.392635,0,2,0,0,3,1,0,1,2,1,3,2,3.095569,1.966354,-1.91391,-4.345637,-5.080232,-4.358149,-3.480443,-5.962773,-238.048664,0,0,0,0.423911
16248,160966,0.578058,0.435574,1.036967,0.393754,1.7951,-0.53876,0.700666,1,3,3,3,1,1,1,2,0,0,4,1,-1.120046,-2.191985,0.597896,-1.264482,-0.066934,-1.922217,0.838191,3.240245,443.892115,0,0,1,0.487789
56194,557102,0.119832,-1.140194,0.259692,0.768091,0.435252,-0.705126,-0.101193,2,4,4,2,1,1,4,1,7,6,3,6,-0.670141,-2.160991,0.812893,0.443669,-0.593305,-1.31924,0.505617,-1.560496,-23.346895,1,1,1,0.548876
7902,76957,0.186579,0.635994,-1.792145,-0.002432,0.991567,1.208961,0.261658,4,2,3,3,2,4,1,2,2,3,2,0,0.886578,0.710126,2.281544,-4.812871,-1.067746,3.884482,-2.182611,6.669902,154.033815,1,0,1,0.548911


Number of features before transaformation:  (90540, 32)


Unnamed: 0,cat__f_29_0,cat__f_29_1,cat__f_30_0,cat__f_30_1,cat__f_30_2,remainder__id,remainder__f_00,remainder__f_01,remainder__f_02,remainder__f_03,remainder__f_04,remainder__f_05,remainder__f_06,remainder__f_07,remainder__f_08,remainder__f_09,remainder__f_10,remainder__f_11,remainder__f_12,remainder__f_13,remainder__f_14,remainder__f_15,remainder__f_16,remainder__f_17,remainder__f_18,remainder__f_19,remainder__f_20,remainder__f_21,remainder__f_22,remainder__f_23,remainder__f_24,remainder__f_25,remainder__f_26,remainder__f_28,remainder__f_27_encoded
0,1.0,0.0,1.0,0.0,0.0,8.0,0.866221,0.842351,-1.367634,1.237506,0.682962,0.845666,0.173852,3.0,2.0,1.0,7.0,4.0,3.0,5.0,2.0,3.0,0.0,0.0,4.0,-2.56893,-0.796303,-0.786741,-1.241121,-1.245038,1.696402,0.354545,-0.541267,-52.223857,0.484221
1,1.0,0.0,0.0,0.0,1.0,18.0,0.922494,-0.828627,0.953987,0.835863,0.112589,2.432248,0.236815,0.0,3.0,3.0,1.0,2.0,1.0,2.0,5.0,1.0,1.0,3.0,2.0,0.594356,-0.983587,-0.789391,4.188384,-2.615815,-2.015672,7.466349,6.045183,448.896335,0.484221
2,0.0,1.0,0.0,1.0,0.0,21.0,-0.645481,1.060355,-1.213576,-1.319743,-0.032761,0.868289,0.082763,2.0,3.0,4.0,1.0,2.0,3.0,2.0,0.0,9.0,1.0,2.0,3.0,-3.408796,3.370601,0.445238,-0.305235,-4.579724,-0.982572,1.438014,-1.00872,143.101347,0.484221
3,0.0,1.0,0.0,1.0,0.0,27.0,-1.814636,-1.578055,-0.164912,1.02449,0.810403,1.94986,1.704441,1.0,1.0,3.0,4.0,1.0,3.0,5.0,1.0,0.0,1.0,3.0,4.0,1.247929,-3.367163,0.97469,-0.255284,0.086702,-2.260849,0.98156,-3.27038,36.856304,0.484221
4,0.0,1.0,0.0,1.0,0.0,39.0,0.916766,-0.468947,-0.255398,-0.569036,0.193245,0.77535,-0.274236,2.0,2.0,6.0,0.0,1.0,7.0,3.0,1.0,1.0,7.0,3.0,5.0,0.593269,-1.350143,-4.503929,-0.129318,-1.132656,0.588172,-3.338356,0.478027,-262.819456,0.484221


Unnamed: 0,cat__f_29_0,cat__f_29_1,cat__f_30_0,cat__f_30_1,cat__f_30_2,remainder__id,remainder__f_00,remainder__f_01,remainder__f_02,remainder__f_03,remainder__f_04,remainder__f_05,remainder__f_06,remainder__f_07,remainder__f_08,remainder__f_09,remainder__f_10,remainder__f_11,remainder__f_12,remainder__f_13,remainder__f_14,remainder__f_15,remainder__f_16,remainder__f_17,remainder__f_18,remainder__f_19,remainder__f_20,remainder__f_21,remainder__f_22,remainder__f_23,remainder__f_24,remainder__f_25,remainder__f_26,remainder__f_28,remainder__f_27_encoded
0,1.0,0.0,1.0,0.0,0.0,289957.0,-0.148378,0.548169,0.829241,0.802299,1.274497,0.114888,1.61088,0.0,5.0,2.0,0.0,1.0,0.0,1.0,2.0,2.0,3.0,1.0,1.0,-1.383431,-2.436701,0.738691,-0.35718,3.859422,-0.357368,-1.819748,-0.173728,513.106852,0.548925
1,1.0,0.0,1.0,0.0,0.0,30286.0,-0.257653,-0.239395,-1.265026,-1.11995,-0.521108,0.613049,1.392635,0.0,2.0,0.0,0.0,3.0,1.0,0.0,1.0,2.0,1.0,3.0,2.0,3.095569,1.966354,-1.91391,-4.345637,-5.080232,-4.358149,-3.480443,-5.962773,-238.048664,0.423911
2,1.0,0.0,1.0,0.0,0.0,160966.0,0.578058,0.435574,1.036967,0.393754,1.7951,-0.53876,0.700666,1.0,3.0,3.0,3.0,1.0,1.0,1.0,2.0,0.0,0.0,4.0,1.0,-1.120046,-2.191985,0.597896,-1.264482,-0.066934,-1.922217,0.838191,3.240245,443.892115,0.487789
3,0.0,1.0,0.0,1.0,0.0,557102.0,0.119832,-1.140194,0.259692,0.768091,0.435252,-0.705126,-0.101193,2.0,4.0,4.0,2.0,1.0,1.0,4.0,1.0,7.0,6.0,3.0,6.0,-0.670141,-2.160991,0.812893,0.443669,-0.593305,-1.31924,0.505617,-1.560496,-23.346895,0.548876
4,0.0,1.0,1.0,0.0,0.0,76957.0,0.186579,0.635994,-1.792145,-0.002432,0.991567,1.208961,0.261658,4.0,2.0,3.0,3.0,2.0,4.0,1.0,2.0,2.0,3.0,2.0,0.0,0.886578,0.710126,2.281544,-4.812871,-1.067746,3.884482,-2.182611,6.669902,154.033815,0.548911


In [22]:
# 6. fit XGBoost #

time1 = time.time()
xgb = XGBClassifier(n_estimators=100, max_depth=6, eta=0.1)
xgb.fit(X_train, y_train)
display(time.time()-time1)


display('Accuracy: ', accuracy_score(y_train,xgb.predict(X_train)))
display('F1 score: ', f1_score(y_train,xgb.predict(X_train)))
display('Recall score: ', recall_score(y_train,xgb.predict(X_train)))
display('Precision score: ', precision_score(y_train,xgb.predict(X_train)))
# Performance evaluation:
display('Accuracy: ', accuracy_score(y_test,xgb.predict(X_test)))
display('F1 score: ', f1_score(y_test,xgb.predict(X_test)))
display('Recall score: ', recall_score(y_test,xgb.predict(X_test)))
display('Precision score: ', precision_score(y_test,xgb.predict(X_test)))


40.41128444671631

'Accuracy: '

0.8439253368676828

'F1 score: '

0.8362059972413154

'Recall score: '

0.8223184771457882

'Precision score: '

0.8505706470477269

'Accuracy: '

0.8688977247625359

'F1 score: '

0.866313774073657

'Recall score: '

0.8594413407821229

'Precision score: '

0.8732970027247956

In [23]:
# optuna hyperparameter optimization

time1 = time.time()

def objective(trial, n_splits=2, n_jobs=-1, scale_pos_weight=1, early_stopping_rounds=50):

    cv_regularizer=0.0
    # Usually values between 0.1 and 0.2 work fine.

    params = {
        "tree_method": 'gpu_hist',
        "verbosity": 0,  # 0 (silent) - 3 (debug)
        "n_estimators": trial.suggest_int("n_estimators", 100, 700),
        "max_depth": trial.suggest_int("max_depth", 4, 10),
        "learning_rate": trial.suggest_uniform("learning_rate", 0.02, 0.3),
        "colsample_bytree": trial.suggest_uniform("colsample_bytree", 0.1, 0.95),
        "subsample": trial.suggest_uniform("subsample", 0.5, 0.95),
        "alpha": trial.suggest_loguniform("alpha", 0.1, 10.0),
        "lambda": trial.suggest_loguniform("lambda", 0.1, 100.0),
        "gamma": trial.suggest_loguniform("gamma", 1e-10, 10.0),
        "min_child_weight": trial.suggest_loguniform("min_child_weight", 0.1, 10),
        "n_jobs": n_jobs,
    }

    X = X_train
    y = y_train

    model = XGBClassifier(**params)
    rkf = KFold(n_splits=n_splits, shuffle=True)
    X_values = X.values
    y_values = y.values
    y_pred = np.zeros_like(y_values)
    y_pred_train = np.zeros_like(y_values)
    for train_index, test_index in rkf.split(X_values):
        X_A, X_B = X_values[train_index, :], X_values[test_index, :]
        y_A, y_B = y_values[train_index], y_values[test_index]
        model.fit(X_A, y_A, eval_set=[(X_B, y_B)],
                  early_stopping_rounds=early_stopping_rounds, verbose = False)
        y_pred[test_index] += model.predict(X_B)
        y_pred_train[train_index] += model.predict(X_A)
    score_train = f1_score(y_train, y_pred_train)
    score_test = f1_score(y_train, y_pred) 
    overfit = score_train-score_test
    #return (f1_score_test)
    return (score_test-cv_regularizer*overfit)


study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=40)
print('Total time for hypermarameter optimization ', time.time()-time1)
hp = study.best_params
for key, value in hp.items():
    print(f"{key:>20s} : {value}")
print(f"{'best objective value':>20s} : {study.best_value}")

optuna_hyperpars = study.best_params
optuna_hyperpars['tree_method']='gpu_hist'
optuna_hyperpars['scale_pos_weight']=1
#optuna_hyperpars['early_stopping_rounds']=50

optuna_xgb = XGBClassifier(**optuna_hyperpars)
optuna_xgb.fit(X_train, y_train)


[32m[I 2022-07-21 20:07:12,144][0m A new study created in memory with name: no-name-d29071e3-8349-4b25-9c3e-417b02286ef6[0m
[32m[I 2022-07-21 20:07:27,341][0m Trial 0 finished with value: 0.826227249239249 and parameters: {'n_estimators': 636, 'max_depth': 9, 'learning_rate': 0.1914385445454791, 'colsample_bytree': 0.749142250718337, 'subsample': 0.7113455166216284, 'alpha': 1.5215215742410007, 'lambda': 18.107204935645516, 'gamma': 4.092383497351642e-08, 'min_child_weight': 0.10513270778208554}. Best is trial 0 with value: 0.826227249239249.[0m
[32m[I 2022-07-21 20:07:35,311][0m Trial 1 finished with value: 0.8293636196431855 and parameters: {'n_estimators': 630, 'max_depth': 6, 'learning_rate': 0.17560762307649327, 'colsample_bytree': 0.7894866578795885, 'subsample': 0.8641063093497012, 'alpha': 3.8905180758963436, 'lambda': 42.54006109882384, 'gamma': 2.0032768111499814e-05, 'min_child_weight': 2.66504730571528}. Best is trial 1 with value: 0.8293636196431855.[0m
[32m[I 20

Total time for hypermarameter optimization  419.4376940727234
        n_estimators : 543
           max_depth : 10
       learning_rate : 0.05367627363372013
    colsample_bytree : 0.812745702543195
           subsample : 0.7481926902521382
               alpha : 2.737048067746892
              lambda : 8.44524988424516
               gamma : 0.04239413588771567
    min_child_weight : 8.158149051817619
best objective value : 0.8374444227433048


XGBClassifier(alpha=2.737048067746892, base_score=0.5, booster='gbtree',
              callbacks=None, colsample_bylevel=1, colsample_bynode=1,
              colsample_bytree=0.812745702543195, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None,
              gamma=0.04239413588771567, gpu_id=0, grow_policy='depthwise',
              importance_type=None, interaction_constraints='',
              lambda=8.44524988424516, learning_rate=0.05367627363372013,
              max_bin=256, max_cat_to_onehot=4, max_delta_step=0, max_depth=10,
              max_leaves=0, min_child_weight=8.158149051817619, missing=nan,
              monotone_constraints='()', n_estimators=543, n_jobs=0,
              num_parallel_tree=1, predictor='auto', random_state=0, ...)

In [24]:
display('Accuracy: ', accuracy_score(y_train,optuna_xgb.predict(X_train)))
display('F1 score: ', f1_score(y_train,optuna_xgb.predict(X_train)))
display('Recall score: ', recall_score(y_train,optuna_xgb.predict(X_train)))
display('Precision score: ', precision_score(y_train,optuna_xgb.predict(X_train)))
# Performance evaluation:
display('Accuracy: ', accuracy_score(y_test,optuna_xgb.predict(X_test)))
display('F1 score: ', f1_score(y_test,optuna_xgb.predict(X_test)))
display('Recall score: ', recall_score(y_test,optuna_xgb.predict(X_test)))
display('Precision score: ', precision_score(y_test,optuna_xgb.predict(X_test)))

'Accuracy: '

0.9386127678374199

'F1 score: '

0.9362804668332836

'Recall score: '

0.9309016300011399

'Precision score: '

0.9417218237586772

'Accuracy: '

0.9652087475149106

'F1 score: '

0.9644027573737145

'Recall score: '

0.9535195530726257

'Precision score: '

0.9755372656607224

In [25]:
# 7. Do FI analysisany