Outline:
1. Load evth.
2. Preclean data.
3. Train-test split.
4. Missing values.
5. FE.
6. Modeling.
7. FI.
8. predictions.

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import os, time, warnings, gzip, gc, random, math, shap, pickle, optuna
from IPython.display import display
from matplotlib_venn import venn2, venn2_circles, venn2_unweighted
from matplotlib_venn import venn3, venn3_circles
from matplotlib import pyplot as plt
%matplotlib inline

from sklearn.preprocessing import LabelBinarizer, LabelEncoder, OrdinalEncoder, OneHotEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, train_test_split, KFold
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, precision_recall_curve, auc
from sklearn.metrics import mean_squared_error, mean_absolute_error, roc_auc_score
from sklearn.inspection import permutation_importance
from category_encoders import MEstimateEncoder
from xgboost import XGBClassifier

pd.set_option('display.max_columns', 5000)
pd.set_option('display.max_rows',200)

warnings.filterwarnings("ignore")

# target encoding code:

### target encoding ###
# source: https://www.kaggle.com/code/ryanholbrook/feature-engineering-for-house-prices/notebook

class CrossFoldEncoder:
    def __init__(self, encoder, **kwargs):
        self.encoder_ = encoder
        self.kwargs_ = kwargs  # keyword arguments for the encoder
        self.cv_ = KFold(n_splits=4)

    # Fit an encoder on one split and transform the feature on the
    # other. Iterating over the splits in all folds gives a complete
    # transformation. We also now have one trained encoder on each
    # fold.
    def fit_transform(self, X, y, cols):
        self.fitted_encoders_ = []
        self.cols_ = cols
        X_encoded = []
        for idx_encode, idx_train in self.cv_.split(X):
            fitted_encoder = self.encoder_(cols=cols, **self.kwargs_)
            fitted_encoder.fit(
                X.iloc[idx_encode, :], y.iloc[idx_encode],
            )
            X_encoded.append(fitted_encoder.transform(X.iloc[idx_train, :])[cols])
            self.fitted_encoders_.append(fitted_encoder)
        X_encoded = pd.concat(X_encoded)
        X_encoded.columns = [name + "_encoded" for name in X_encoded.columns]
        return X_encoded

    # To transform the test data, average the encodings learned from
    # each fold.
    def transform(self, X):
        from functools import reduce

        X_encoded_list = []
        for fitted_encoder in self.fitted_encoders_:
            X_encoded = fitted_encoder.transform(X)
            X_encoded_list.append(X_encoded[self.cols_])
        X_encoded = reduce(
            lambda x, y: x.add(y, fill_value=0), X_encoded_list
        ) / len(X_encoded_list)
        X_encoded.columns = [name + "_encoded" for name in X_encoded.columns]
        return X_encoded


In [2]:
def TargetEncoderMP(train_set, test_set, feature_cols, target_col, M=5):
    
    """ This function implements terget encoding on train-test split with 2-fold CV
    It relies on previously defined CrossFoldEncoder, stolen from somehwere on SO.
    m parameter controls smoothing and is defaulted at 5 """
    
    encoder = CrossFoldEncoder(MEstimateEncoder, m=M)
    train_encoded = encoder.fit_transform(train_set, train_set[target_col], cols=feature_cols)
    test_encoded = encoder.transform(test_set)

    train_set.drop(columns=feature_cols, inplace=True)
    test_set.drop(columns=feature_cols, inplace=True)
    train_set = pd.concat([train_set, train_encoded], axis = 1)
    test_set = pd.concat([test_set, test_encoded], axis = 1)
    
    return ([train_set, test_set])
    

In [3]:
# 1. Load data #

train = pd.read_csv('../input/tabular-playground-series-may-2022/train.csv',
                   skiprows=lambda i: i>0 and random.random() > 0.1)
display(train.shape, train.head())

display(train.target.value_counts())
num_cols = [col for col in train.columns if train[col].nunique()>10]
num_cols.remove('f_27')
cat_cols = list(set(train.columns) - set(num_cols) - set(['target']))
print('num_cols: ', num_cols, '\n', 'cat_cols: ', cat_cols)
display(train[num_cols].describe())
cat_cols = list(set(train.columns) - set(num_cols) - set(['target']))
display([train[col].value_counts() for col in cat_cols])

#[train[col].value_counts() for col in train.columns if train[col].nunique()<10]
#display(train.count())

(89944, 33)

Unnamed: 0,id,f_00,f_01,f_02,f_03,f_04,f_05,f_06,f_07,f_08,f_09,f_10,f_11,f_12,f_13,f_14,f_15,f_16,f_17,f_18,f_19,f_20,f_21,f_22,f_23,f_24,f_25,f_26,f_27,f_28,f_29,f_30,target
0,14,0.126395,0.887744,1.522033,0.378453,1.372189,-0.225767,1.131681,3,2,1,5,1,0,8,1,0,0,0,3,2.562544,3.820487,0.831654,-1.748326,-0.501451,-1.334737,0.159523,-1.820546,BAACBBDDAC,549.135554,0,1,1
1,60,0.870784,0.111933,-2.372691,0.0472,1.302467,0.264599,0.297627,4,3,0,1,0,2,2,1,5,6,0,2,1.363273,-1.800639,-1.858568,-1.526342,-0.186489,-2.213499,1.208514,-1.10494,BBABEAECEA,255.295669,0,2,1
2,65,-0.5926,-0.946231,-1.41425,-0.564863,-0.057431,-1.551445,0.017653,2,5,5,1,1,5,0,1,2,1,3,0,2.784968,0.210196,-3.110553,-1.573672,0.764691,-0.664031,-4.765627,-7.918127,ABBCBACQBC,-96.954048,0,0,0
3,78,-0.45264,0.639057,1.508553,0.936094,-0.925203,0.945437,-0.38273,2,3,2,1,1,2,2,0,6,3,3,2,1.142594,3.420914,0.908999,-5.435368,-0.980417,0.557273,-0.64188,0.850362,BABEEBCFFB,17.042381,1,1,1
4,86,-1.429118,-0.931012,-2.127887,-0.098724,-0.649361,-0.188876,0.117951,2,0,1,2,1,3,1,1,6,4,0,1,0.139332,3.519236,-4.174904,-1.669505,0.92344,-2.358893,-0.427311,2.399393,ABACCBCEEA,-182.406333,0,1,0


0    46022
1    43922
Name: target, dtype: int64

num_cols:  ['id', 'f_00', 'f_01', 'f_02', 'f_03', 'f_04', 'f_05', 'f_06', 'f_07', 'f_08', 'f_09', 'f_10', 'f_11', 'f_12', 'f_13', 'f_14', 'f_15', 'f_16', 'f_17', 'f_18', 'f_19', 'f_20', 'f_21', 'f_22', 'f_23', 'f_24', 'f_25', 'f_26', 'f_28'] 
 cat_cols:  ['f_27', 'f_30', 'f_29']


Unnamed: 0,id,f_00,f_01,f_02,f_03,f_04,f_05,f_06,f_07,f_08,f_09,f_10,f_11,f_12,f_13,f_14,f_15,f_16,f_17,f_18,f_19,f_20,f_21,f_22,f_23,f_24,f_25,f_26,f_28
count,89944.0,89944.0,89944.0,89944.0,89944.0,89944.0,89944.0,89944.0,89944.0,89944.0,89944.0,89944.0,89944.0,89944.0,89944.0,89944.0,89944.0,89944.0,89944.0,89944.0,89944.0,89944.0,89944.0,89944.0,89944.0,89944.0,89944.0,89944.0,89944.0
mean,449991.850685,-0.009919,0.004096,0.006722,-0.00151,-0.001657,0.002865,0.006601,2.033443,2.057558,2.360358,2.173875,1.808114,2.832918,2.238315,1.516677,2.101463,2.102508,1.853987,2.062628,0.319945,-0.184406,-0.16575,0.003694,-0.358039,-0.338254,0.179779,0.342045,0.726072
std,259399.135894,0.995306,0.998469,1.001358,0.998888,1.001848,1.004141,0.999048,1.656555,1.596483,1.637895,1.643194,1.533758,1.762663,1.542712,1.35803,1.565843,1.56158,1.468106,1.563605,2.301971,2.398822,2.480146,2.457911,2.447349,2.389702,2.414206,2.474733,238.615944
min,14.0,-4.171053,-4.323802,-4.1212,-4.628484,-4.3734,-4.10243,-4.838879,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-9.300771,-11.257917,-10.497804,-10.757361,-11.629187,-11.380817,-11.918306,-11.213371,-1118.6456
25%,225808.5,-0.682567,-0.671367,-0.670988,-0.677988,-0.678429,-0.675874,-0.66819,1.0,1.0,1.0,1.0,1.0,2.0,1.0,0.0,1.0,1.0,1.0,1.0,-1.216519,-1.808324,-1.829953,-1.640153,-2.001562,-1.942311,-1.428405,-1.284274,-158.141086
50%,449830.5,-0.01046,0.009427,0.003651,-0.001896,-0.003719,0.002036,0.011798,2.0,2.0,2.0,2.0,2.0,3.0,2.0,1.0,2.0,2.0,2.0,2.0,0.341192,-0.195542,-0.153019,0.030981,-0.369793,-0.326065,0.1597,0.381655,0.162727
75%,673802.75,0.662148,0.675679,0.684041,0.673243,0.677055,0.679634,0.680542,3.0,3.0,3.0,3.0,3.0,4.0,3.0,2.0,3.0,3.0,3.0,3.0,1.878917,1.432375,1.50845,1.688972,1.261364,1.267711,1.789026,2.019776,159.856655
max,899984.0,4.017473,4.702502,3.944027,4.45492,4.34818,4.383716,4.822668,15.0,14.0,12.0,12.0,13.0,13.0,11.0,11.0,12.0,11.0,12.0,12.0,10.281401,11.475325,11.679436,10.502184,10.300538,9.647453,9.934249,10.312991,1189.545831


[ADBBBABTBC    5
 AEBBBABQCD    4
 BBAABADIBB    4
 ABBBBAAKBD    4
 ACBAAAALDC    4
              ..
 AFADBAECEC    1
 ADBBABBLAE    1
 ACBCAAAEBE    1
 ABBCBABIAB    1
 BABBBBBBBC    1
 Name: f_27, Length: 87914, dtype: int64,
 2    30295
 0    30172
 1    29477
 Name: f_30, dtype: int64,
 0    58595
 1    31349
 Name: f_29, dtype: int64]

In [4]:
# 3. split data #

#train_test_split approach does not work when I use TE.

test_size = 0.1
train.reset_index(inplace=True, drop=True)
test_index = random.sample(list(train.index), int(test_size*train.shape[0]))
train_ = train.iloc[list(set(train.index)-set(test_index))]
test = train.iloc[test_index]
display(train.shape, test.shape, train.head(3), test.head(3))

train0, test0 = train.copy(), test.copy()

(89944, 33)

(8994, 33)

Unnamed: 0,id,f_00,f_01,f_02,f_03,f_04,f_05,f_06,f_07,f_08,f_09,f_10,f_11,f_12,f_13,f_14,f_15,f_16,f_17,f_18,f_19,f_20,f_21,f_22,f_23,f_24,f_25,f_26,f_27,f_28,f_29,f_30,target
0,14,0.126395,0.887744,1.522033,0.378453,1.372189,-0.225767,1.131681,3,2,1,5,1,0,8,1,0,0,0,3,2.562544,3.820487,0.831654,-1.748326,-0.501451,-1.334737,0.159523,-1.820546,BAACBBDDAC,549.135554,0,1,1
1,60,0.870784,0.111933,-2.372691,0.0472,1.302467,0.264599,0.297627,4,3,0,1,0,2,2,1,5,6,0,2,1.363273,-1.800639,-1.858568,-1.526342,-0.186489,-2.213499,1.208514,-1.10494,BBABEAECEA,255.295669,0,2,1
2,65,-0.5926,-0.946231,-1.41425,-0.564863,-0.057431,-1.551445,0.017653,2,5,5,1,1,5,0,1,2,1,3,0,2.784968,0.210196,-3.110553,-1.573672,0.764691,-0.664031,-4.765627,-7.918127,ABBCBACQBC,-96.954048,0,0,0


Unnamed: 0,id,f_00,f_01,f_02,f_03,f_04,f_05,f_06,f_07,f_08,f_09,f_10,f_11,f_12,f_13,f_14,f_15,f_16,f_17,f_18,f_19,f_20,f_21,f_22,f_23,f_24,f_25,f_26,f_27,f_28,f_29,f_30,target
64852,647731,-0.528032,0.387955,1.316136,0.208387,0.693731,0.966846,0.177412,2,5,3,1,2,5,1,0,3,2,3,5,3.198583,4.878173,5.211466,-2.333863,-2.078104,3.085743,4.748978,4.34879,ACBABAAADC,-85.34573,0,1,1
74660,746658,-0.185579,0.002287,-1.862446,-1.755264,0.036781,-0.965277,0.863652,0,4,3,0,1,4,1,2,4,1,1,2,1.807302,5.081426,-2.221318,2.162374,-3.701266,-1.973977,2.023197,-5.403082,AEBAAABFEB,65.114149,0,1,0
82379,823719,-1.405858,-0.625671,1.377267,-0.426402,-0.589901,0.064917,0.054886,7,3,4,4,3,2,1,2,0,5,2,6,-0.267556,-0.071308,2.823644,2.064331,-1.14877,2.615817,-2.726125,1.737848,BCACCACNCC,192.736299,1,1,0


In [5]:
# 5. FE #

# first do TE

train, test = train0.copy(), test0.copy()

display(train.head(), test.head())
train, test = TargetEncoderMP(train, test, ['f_27'], 'target')
display(train.head(), test.head())

# then extract a target

X_train = train.copy()
y_train = X_train.pop('target')
X_test = test.copy()
y_test = X_test.pop('target')

# then do OHE with columntransformer

ohe_cols = ['f_29', 'f_30']
feature_transformer = ColumnTransformer([
   ("cat", OneHotEncoder(sparse = False, handle_unknown="ignore"), ohe_cols)],
   remainder="passthrough")
print('Number of features before transaformation: ', X_train.shape)
X_train = pd.DataFrame(feature_transformer.fit_transform(X_train), columns=feature_transformer.get_feature_names_out())
X_test = pd.DataFrame(feature_transformer.transform(X_test), columns=feature_transformer.get_feature_names_out())
display(X_train.head(), X_test.head())


Unnamed: 0,id,f_00,f_01,f_02,f_03,f_04,f_05,f_06,f_07,f_08,f_09,f_10,f_11,f_12,f_13,f_14,f_15,f_16,f_17,f_18,f_19,f_20,f_21,f_22,f_23,f_24,f_25,f_26,f_27,f_28,f_29,f_30,target
0,14,0.126395,0.887744,1.522033,0.378453,1.372189,-0.225767,1.131681,3,2,1,5,1,0,8,1,0,0,0,3,2.562544,3.820487,0.831654,-1.748326,-0.501451,-1.334737,0.159523,-1.820546,BAACBBDDAC,549.135554,0,1,1
1,60,0.870784,0.111933,-2.372691,0.0472,1.302467,0.264599,0.297627,4,3,0,1,0,2,2,1,5,6,0,2,1.363273,-1.800639,-1.858568,-1.526342,-0.186489,-2.213499,1.208514,-1.10494,BBABEAECEA,255.295669,0,2,1
2,65,-0.5926,-0.946231,-1.41425,-0.564863,-0.057431,-1.551445,0.017653,2,5,5,1,1,5,0,1,2,1,3,0,2.784968,0.210196,-3.110553,-1.573672,0.764691,-0.664031,-4.765627,-7.918127,ABBCBACQBC,-96.954048,0,0,0
3,78,-0.45264,0.639057,1.508553,0.936094,-0.925203,0.945437,-0.38273,2,3,2,1,1,2,2,0,6,3,3,2,1.142594,3.420914,0.908999,-5.435368,-0.980417,0.557273,-0.64188,0.850362,BABEEBCFFB,17.042381,1,1,1
4,86,-1.429118,-0.931012,-2.127887,-0.098724,-0.649361,-0.188876,0.117951,2,0,1,2,1,3,1,1,6,4,0,1,0.139332,3.519236,-4.174904,-1.669505,0.92344,-2.358893,-0.427311,2.399393,ABACCBCEEA,-182.406333,0,1,0


Unnamed: 0,id,f_00,f_01,f_02,f_03,f_04,f_05,f_06,f_07,f_08,f_09,f_10,f_11,f_12,f_13,f_14,f_15,f_16,f_17,f_18,f_19,f_20,f_21,f_22,f_23,f_24,f_25,f_26,f_27,f_28,f_29,f_30,target
64852,647731,-0.528032,0.387955,1.316136,0.208387,0.693731,0.966846,0.177412,2,5,3,1,2,5,1,0,3,2,3,5,3.198583,4.878173,5.211466,-2.333863,-2.078104,3.085743,4.748978,4.34879,ACBABAAADC,-85.34573,0,1,1
74660,746658,-0.185579,0.002287,-1.862446,-1.755264,0.036781,-0.965277,0.863652,0,4,3,0,1,4,1,2,4,1,1,2,1.807302,5.081426,-2.221318,2.162374,-3.701266,-1.973977,2.023197,-5.403082,AEBAAABFEB,65.114149,0,1,0
82379,823719,-1.405858,-0.625671,1.377267,-0.426402,-0.589901,0.064917,0.054886,7,3,4,4,3,2,1,2,0,5,2,6,-0.267556,-0.071308,2.823644,2.064331,-1.14877,2.615817,-2.726125,1.737848,BCACCACNCC,192.736299,1,1,0
21816,218972,-2.104047,0.538019,0.593648,-1.784599,-0.131712,-2.694481,-0.916912,1,1,3,1,4,4,1,0,2,0,1,0,1.177969,-1.61555,-3.309689,2.233368,0.078751,-1.395136,-0.324348,-1.83833,AGBBCAFLBD,-638.699174,0,2,1
34308,343898,-1.048022,-0.382459,2.820161,1.567454,1.11046,-0.285399,1.360149,4,3,1,0,3,3,4,1,3,2,1,0,-2.974381,2.58096,-1.027097,-2.601795,0.004346,-0.755262,-0.111948,2.037621,BBBAABDBDC,303.689154,0,1,0


Unnamed: 0,id,f_00,f_01,f_02,f_03,f_04,f_05,f_06,f_07,f_08,f_09,f_10,f_11,f_12,f_13,f_14,f_15,f_16,f_17,f_18,f_19,f_20,f_21,f_22,f_23,f_24,f_25,f_26,f_28,f_29,f_30,target,f_27_encoded
0,14,0.126395,0.887744,1.522033,0.378453,1.372189,-0.225767,1.131681,3,2,1,5,1,0,8,1,0,0,0,3,2.562544,3.820487,0.831654,-1.748326,-0.501451,-1.334737,0.159523,-1.820546,549.135554,0,1,1,0.488081
1,60,0.870784,0.111933,-2.372691,0.0472,1.302467,0.264599,0.297627,4,3,0,1,0,2,2,1,5,6,0,2,1.363273,-1.800639,-1.858568,-1.526342,-0.186489,-2.213499,1.208514,-1.10494,255.295669,0,2,1,0.488081
2,65,-0.5926,-0.946231,-1.41425,-0.564863,-0.057431,-1.551445,0.017653,2,5,5,1,1,5,0,1,2,1,3,0,2.784968,0.210196,-3.110553,-1.573672,0.764691,-0.664031,-4.765627,-7.918127,-96.954048,0,0,0,0.488081
3,78,-0.45264,0.639057,1.508553,0.936094,-0.925203,0.945437,-0.38273,2,3,2,1,1,2,2,0,6,3,3,2,1.142594,3.420914,0.908999,-5.435368,-0.980417,0.557273,-0.64188,0.850362,17.042381,1,1,1,0.488081
4,86,-1.429118,-0.931012,-2.127887,-0.098724,-0.649361,-0.188876,0.117951,2,0,1,2,1,3,1,1,6,4,0,1,0.139332,3.519236,-4.174904,-1.669505,0.92344,-2.358893,-0.427311,2.399393,-182.406333,0,1,0,0.406735


Unnamed: 0,id,f_00,f_01,f_02,f_03,f_04,f_05,f_06,f_07,f_08,f_09,f_10,f_11,f_12,f_13,f_14,f_15,f_16,f_17,f_18,f_19,f_20,f_21,f_22,f_23,f_24,f_25,f_26,f_28,f_29,f_30,target,f_27_encoded
64852,647731,-0.528032,0.387955,1.316136,0.208387,0.693731,0.966846,0.177412,2,5,3,1,2,5,1,0,3,2,3,5,3.198583,4.878173,5.211466,-2.333863,-2.078104,3.085743,4.748978,4.34879,-85.34573,0,1,1,0.604035
74660,746658,-0.185579,0.002287,-1.862446,-1.755264,0.036781,-0.965277,0.863652,0,4,3,0,1,4,1,2,4,1,1,2,1.807302,5.081426,-2.221318,2.162374,-3.701266,-1.973977,2.023197,-5.403082,65.114149,0,1,0,0.427333
82379,823719,-1.405858,-0.625671,1.377267,-0.426402,-0.589901,0.064917,0.054886,7,3,4,4,3,2,1,2,0,5,2,6,-0.267556,-0.071308,2.823644,2.064331,-1.14877,2.615817,-2.726125,1.737848,192.736299,1,1,0,0.427333
21816,218972,-2.104047,0.538019,0.593648,-1.784599,-0.131712,-2.694481,-0.916912,1,1,3,1,4,4,1,0,2,0,1,0,1.177969,-1.61555,-3.309689,2.233368,0.078751,-1.395136,-0.324348,-1.83833,-638.699174,0,2,1,0.552275
34308,343898,-1.048022,-0.382459,2.820161,1.567454,1.11046,-0.285399,1.360149,4,3,1,0,3,3,4,1,3,2,1,0,-2.974381,2.58096,-1.027097,-2.601795,0.004346,-0.755262,-0.111948,2.037621,303.689154,0,1,0,0.427273


Number of features before transaformation:  (89944, 32)


Unnamed: 0,cat__f_29_0,cat__f_29_1,cat__f_30_0,cat__f_30_1,cat__f_30_2,remainder__id,remainder__f_00,remainder__f_01,remainder__f_02,remainder__f_03,remainder__f_04,remainder__f_05,remainder__f_06,remainder__f_07,remainder__f_08,remainder__f_09,remainder__f_10,remainder__f_11,remainder__f_12,remainder__f_13,remainder__f_14,remainder__f_15,remainder__f_16,remainder__f_17,remainder__f_18,remainder__f_19,remainder__f_20,remainder__f_21,remainder__f_22,remainder__f_23,remainder__f_24,remainder__f_25,remainder__f_26,remainder__f_28,remainder__f_27_encoded
0,1.0,0.0,0.0,1.0,0.0,14.0,0.126395,0.887744,1.522033,0.378453,1.372189,-0.225767,1.131681,3.0,2.0,1.0,5.0,1.0,0.0,8.0,1.0,0.0,0.0,0.0,3.0,2.562544,3.820487,0.831654,-1.748326,-0.501451,-1.334737,0.159523,-1.820546,549.135554,0.488081
1,1.0,0.0,0.0,0.0,1.0,60.0,0.870784,0.111933,-2.372691,0.0472,1.302467,0.264599,0.297627,4.0,3.0,0.0,1.0,0.0,2.0,2.0,1.0,5.0,6.0,0.0,2.0,1.363273,-1.800639,-1.858568,-1.526342,-0.186489,-2.213499,1.208514,-1.10494,255.295669,0.488081
2,1.0,0.0,1.0,0.0,0.0,65.0,-0.5926,-0.946231,-1.41425,-0.564863,-0.057431,-1.551445,0.017653,2.0,5.0,5.0,1.0,1.0,5.0,0.0,1.0,2.0,1.0,3.0,0.0,2.784968,0.210196,-3.110553,-1.573672,0.764691,-0.664031,-4.765627,-7.918127,-96.954048,0.488081
3,0.0,1.0,0.0,1.0,0.0,78.0,-0.45264,0.639057,1.508553,0.936094,-0.925203,0.945437,-0.38273,2.0,3.0,2.0,1.0,1.0,2.0,2.0,0.0,6.0,3.0,3.0,2.0,1.142594,3.420914,0.908999,-5.435368,-0.980417,0.557273,-0.64188,0.850362,17.042381,0.488081
4,1.0,0.0,0.0,1.0,0.0,86.0,-1.429118,-0.931012,-2.127887,-0.098724,-0.649361,-0.188876,0.117951,2.0,0.0,1.0,2.0,1.0,3.0,1.0,1.0,6.0,4.0,0.0,1.0,0.139332,3.519236,-4.174904,-1.669505,0.92344,-2.358893,-0.427311,2.399393,-182.406333,0.406735


Unnamed: 0,cat__f_29_0,cat__f_29_1,cat__f_30_0,cat__f_30_1,cat__f_30_2,remainder__id,remainder__f_00,remainder__f_01,remainder__f_02,remainder__f_03,remainder__f_04,remainder__f_05,remainder__f_06,remainder__f_07,remainder__f_08,remainder__f_09,remainder__f_10,remainder__f_11,remainder__f_12,remainder__f_13,remainder__f_14,remainder__f_15,remainder__f_16,remainder__f_17,remainder__f_18,remainder__f_19,remainder__f_20,remainder__f_21,remainder__f_22,remainder__f_23,remainder__f_24,remainder__f_25,remainder__f_26,remainder__f_28,remainder__f_27_encoded
0,1.0,0.0,0.0,1.0,0.0,647731.0,-0.528032,0.387955,1.316136,0.208387,0.693731,0.966846,0.177412,2.0,5.0,3.0,1.0,2.0,5.0,1.0,0.0,3.0,2.0,3.0,5.0,3.198583,4.878173,5.211466,-2.333863,-2.078104,3.085743,4.748978,4.34879,-85.34573,0.604035
1,1.0,0.0,0.0,1.0,0.0,746658.0,-0.185579,0.002287,-1.862446,-1.755264,0.036781,-0.965277,0.863652,0.0,4.0,3.0,0.0,1.0,4.0,1.0,2.0,4.0,1.0,1.0,2.0,1.807302,5.081426,-2.221318,2.162374,-3.701266,-1.973977,2.023197,-5.403082,65.114149,0.427333
2,0.0,1.0,0.0,1.0,0.0,823719.0,-1.405858,-0.625671,1.377267,-0.426402,-0.589901,0.064917,0.054886,7.0,3.0,4.0,4.0,3.0,2.0,1.0,2.0,0.0,5.0,2.0,6.0,-0.267556,-0.071308,2.823644,2.064331,-1.14877,2.615817,-2.726125,1.737848,192.736299,0.427333
3,1.0,0.0,0.0,0.0,1.0,218972.0,-2.104047,0.538019,0.593648,-1.784599,-0.131712,-2.694481,-0.916912,1.0,1.0,3.0,1.0,4.0,4.0,1.0,0.0,2.0,0.0,1.0,0.0,1.177969,-1.61555,-3.309689,2.233368,0.078751,-1.395136,-0.324348,-1.83833,-638.699174,0.552275
4,1.0,0.0,0.0,1.0,0.0,343898.0,-1.048022,-0.382459,2.820161,1.567454,1.11046,-0.285399,1.360149,4.0,3.0,1.0,0.0,3.0,3.0,4.0,1.0,3.0,2.0,1.0,0.0,-2.974381,2.58096,-1.027097,-2.601795,0.004346,-0.755262,-0.111948,2.037621,303.689154,0.427273


In [6]:
# 6. fit XGBoost #

time1 = time.time()
xgb = XGBClassifier(n_estimators=100, max_depth=6, eta=0.1)
xgb.fit(X_train, y_train)
display(time.time()-time1)


display('Accuracy: ', accuracy_score(y_train,xgb.predict(X_train)))
display('F1 score: ', f1_score(y_train,xgb.predict(X_train)))
display('Recall score: ', recall_score(y_train,xgb.predict(X_train)))
display('Precision score: ', precision_score(y_train,xgb.predict(X_train)))
# Performance evaluation:
display('Accuracy: ', accuracy_score(y_test,xgb.predict(X_test)))
display('F1 score: ', f1_score(y_test,xgb.predict(X_test)))
display('Recall score: ', recall_score(y_test,xgb.predict(X_test)))
display('Precision score: ', precision_score(y_test,xgb.predict(X_test)))


41.50220060348511

'Accuracy: '

0.8430467846660145

'F1 score: '

0.8371797977001949

'Recall score: '

0.82630572378307

'Precision score: '

0.8483438909796405

'Accuracy: '

0.8912608405603736

'F1 score: '

0.8841232227488152

'Recall score: '

0.8557339449541285

'Precision score: '

0.9144607843137255

In [7]:
# optuna hyperparameter optimization




In [8]:
# 7. Do FI analysisany