Outline:
1. Load evth.
2. Preclean data.
3. Train-test split.
4. Missing values.
5. FE.
6. Modeling.
7. FI.
8. predictions.

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import os, time, warnings, gzip, gc, random, math, shap, pickle, optuna
from IPython.display import display
from matplotlib_venn import venn2, venn2_circles, venn2_unweighted
from matplotlib_venn import venn3, venn3_circles
from matplotlib import pyplot as plt
%matplotlib inline

from sklearn.preprocessing import LabelBinarizer, LabelEncoder, OrdinalEncoder, OneHotEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, train_test_split, KFold
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, precision_recall_curve, auc
from sklearn.metrics import mean_squared_error, mean_absolute_error, roc_auc_score
from sklearn.inspection import permutation_importance
from category_encoders import MEstimateEncoder
from xgboost import XGBClassifier

pd.set_option('display.max_columns', 5000)
pd.set_option('display.max_rows',200)

warnings.filterwarnings("ignore")

# target encoding code:

### target encoding ###
# source: https://www.kaggle.com/code/ryanholbrook/feature-engineering-for-house-prices/notebook

class CrossFoldEncoder:
    def __init__(self, encoder, **kwargs):
        self.encoder_ = encoder
        self.kwargs_ = kwargs  # keyword arguments for the encoder
        self.cv_ = KFold(n_splits=4)

    # Fit an encoder on one split and transform the feature on the
    # other. Iterating over the splits in all folds gives a complete
    # transformation. We also now have one trained encoder on each
    # fold.
    def fit_transform(self, X, y, cols):
        self.fitted_encoders_ = []
        self.cols_ = cols
        X_encoded = []
        for idx_encode, idx_train in self.cv_.split(X):
            fitted_encoder = self.encoder_(cols=cols, **self.kwargs_)
            fitted_encoder.fit(
                X.iloc[idx_encode, :], y.iloc[idx_encode],
            )
            X_encoded.append(fitted_encoder.transform(X.iloc[idx_train, :])[cols])
            self.fitted_encoders_.append(fitted_encoder)
        X_encoded = pd.concat(X_encoded)
        X_encoded.columns = [name + "_encoded" for name in X_encoded.columns]
        return X_encoded

    # To transform the test data, average the encodings learned from
    # each fold.
    def transform(self, X):
        from functools import reduce

        X_encoded_list = []
        for fitted_encoder in self.fitted_encoders_:
            X_encoded = fitted_encoder.transform(X)
            X_encoded_list.append(X_encoded[self.cols_])
        X_encoded = reduce(
            lambda x, y: x.add(y, fill_value=0), X_encoded_list
        ) / len(X_encoded_list)
        X_encoded.columns = [name + "_encoded" for name in X_encoded.columns]
        return X_encoded


In [2]:
def TargetEncoderMP(train_set, test_set, feature_cols, target_col, M=5):
    
    """ This function implements terget encoding on train-test split with 2-fold CV
    It relies on previously defined CrossFoldEncoder, stolen from somehwere on SO.
    m parameter controls smoothing and is defaulted at 5 """
    
    encoder = CrossFoldEncoder(MEstimateEncoder, m=M)
    train_encoded = encoder.fit_transform(train_set, train_set[target_col], cols=feature_cols)
    test_encoded = encoder.transform(test_set)

    train_set.drop(columns=feature_cols, inplace=True)
    test_set.drop(columns=feature_cols, inplace=True)
    train_set = pd.concat([train_set, train_encoded], axis = 1)
    test_set = pd.concat([test_set, test_encoded], axis = 1)
    
    return ([train_set, test_set])
    

In [3]:
# 1. Load data #

train = pd.read_csv('../input/tabular-playground-series-may-2022/train.csv',
                   skiprows=lambda i: i>0 and random.random() > 0.1)
display(train.shape, train.head())

display(train.target.value_counts())
num_cols = [col for col in train.columns if train[col].nunique()>10]
num_cols.remove('f_27')
cat_cols = list(set(train.columns) - set(num_cols) - set(['target']))
print('num_cols: ', num_cols, '\n', 'cat_cols: ', cat_cols)
display(train[num_cols].describe())
cat_cols = list(set(train.columns) - set(num_cols) - set(['target']))
display([train[col].value_counts() for col in cat_cols])

#[train[col].value_counts() for col in train.columns if train[col].nunique()<10]
#display(train.count())

(90245, 33)

Unnamed: 0,id,f_00,f_01,f_02,f_03,f_04,f_05,f_06,f_07,f_08,f_09,f_10,f_11,f_12,f_13,f_14,f_15,f_16,f_17,f_18,f_19,f_20,f_21,f_22,f_23,f_24,f_25,f_26,f_27,f_28,f_29,f_30,target
0,10,-0.552998,-0.340897,1.066008,0.820531,-1.651332,-0.325655,0.236907,2,2,2,2,2,1,3,1,2,0,1,3,1.791614,-0.075696,-2.238481,4.445503,-0.024057,2.940171,-1.730239,-1.741727,ADBBBBBSDC,-166.024405,0,1,1
1,62,-1.321287,-1.260372,0.579082,-1.816321,-0.114923,2.517207,-1.002934,1,2,5,4,2,4,1,0,2,3,2,4,-3.129782,-2.541379,-0.980057,1.539227,3.31967,-6.805451,-1.769838,-1.714182,BBADDBCJBE,-272.93109,0,0,1
2,67,0.504814,0.10184,0.761814,-1.830958,0.105315,-0.107404,-0.38741,4,7,3,1,2,3,3,2,1,2,3,2,-0.060139,1.687031,-5.556003,4.790562,-6.370943,0.382,5.253365,-1.861232,BABGCBBTAB,141.289169,0,1,1
3,80,-0.237795,0.021497,-0.468444,-0.723512,-1.489679,1.20031,0.321046,1,5,0,3,2,5,3,3,1,2,5,2,-1.403882,2.448212,1.289593,0.986755,-6.309739,0.454259,2.914856,1.522981,ADAGBBCKAD,-55.387164,1,2,1
4,82,0.163361,0.593798,-0.499568,-0.550191,-1.192901,0.462731,0.569567,5,1,3,1,2,3,2,1,2,1,0,0,2.55729,-2.862385,-0.055949,-2.142073,-0.152629,3.043954,-2.634945,0.530955,ADBBAABCAG,210.230707,0,0,1


0    46529
1    43716
Name: target, dtype: int64

num_cols:  ['id', 'f_00', 'f_01', 'f_02', 'f_03', 'f_04', 'f_05', 'f_06', 'f_07', 'f_08', 'f_09', 'f_10', 'f_11', 'f_12', 'f_13', 'f_14', 'f_15', 'f_16', 'f_17', 'f_18', 'f_19', 'f_20', 'f_21', 'f_22', 'f_23', 'f_24', 'f_25', 'f_26', 'f_28'] 
 cat_cols:  ['f_29', 'f_30', 'f_27']


Unnamed: 0,id,f_00,f_01,f_02,f_03,f_04,f_05,f_06,f_07,f_08,f_09,f_10,f_11,f_12,f_13,f_14,f_15,f_16,f_17,f_18,f_19,f_20,f_21,f_22,f_23,f_24,f_25,f_26,f_28
count,90245.0,90245.0,90245.0,90245.0,90245.0,90245.0,90245.0,90245.0,90245.0,90245.0,90245.0,90245.0,90245.0,90245.0,90245.0,90245.0,90245.0,90245.0,90245.0,90245.0,90245.0,90245.0,90245.0,90245.0,90245.0,90245.0,90245.0,90245.0,90245.0
mean,450859.46417,0.002183,0.001442,-0.002434,-0.002044,-0.004988,-0.002821,-0.002647,2.03844,2.065854,2.366247,2.179589,1.805352,2.838263,2.249665,1.515818,2.104948,2.095606,1.8531,2.066984,0.307396,-0.173076,-0.147593,-0.018033,-0.376386,-0.343923,0.180232,0.364982,-2.555536
std,260629.792759,0.998735,0.998755,0.997776,1.001954,1.001163,0.99963,0.999438,1.662339,1.591949,1.637967,1.651494,1.539842,1.76413,1.542247,1.357992,1.572696,1.553798,1.474461,1.57196,2.307406,2.408146,2.484015,2.448967,2.444702,2.392639,2.420154,2.472122,239.06032
min,10.0,-4.599856,-4.284241,-4.059981,-4.582963,-4.293415,-4.023699,-4.05665,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-10.498233,-10.238559,-11.778732,-11.741608,-10.982754,-10.394414,-11.199497,-10.824154,-1161.979234
25%,224301.0,-0.671147,-0.671413,-0.675714,-0.680373,-0.679483,-0.678301,-0.680124,1.0,1.0,1.0,1.0,1.0,2.0,1.0,0.0,1.0,1.0,1.0,1.0,-1.236085,-1.811983,-1.810654,-1.650385,-2.025689,-1.961173,-1.43495,-1.2589,-161.648822
50%,451315.0,0.004981,0.002041,0.000104,-0.000621,-0.003468,-0.00309,0.001179,2.0,2.0,2.0,2.0,2.0,3.0,2.0,1.0,2.0,2.0,2.0,2.0,0.331776,-0.192428,-0.141818,0.024383,-0.398956,-0.343183,0.154835,0.407997,-3.040718
75%,676671.0,0.675911,0.673616,0.670785,0.672953,0.66979,0.670889,0.67123,3.0,3.0,3.0,3.0,3.0,4.0,3.0,2.0,3.0,3.0,3.0,3.0,1.877735,1.460845,1.50796,1.642815,1.253018,1.270567,1.795532,2.034788,155.757858
max,899999.0,4.299185,4.18675,4.749191,4.015574,4.935162,4.971881,4.214725,12.0,14.0,13.0,12.0,12.0,14.0,12.0,14.0,13.0,12.0,12.0,11.0,11.156438,9.387647,11.071167,10.299849,12.07022,9.789678,12.529179,11.408268,1149.282769


[0    58726
 1    31519
 Name: f_29, dtype: int64,
 2    30208
 0    30033
 1    30004
 Name: f_30, dtype: int64,
 BCBBBACFCC    4
 BABBBBCDCD    4
 BAAAAABMCB    3
 BDBCABBNDB    3
 BCBCBBBTCB    3
              ..
 ACBBABCCBH    1
 ACBCAABCBG    1
 AAAEBBCBAF    1
 AFBCAACPBG    1
 BCAACADSCE    1
 Name: f_27, Length: 88153, dtype: int64]

In [4]:
# 3. split data #

#train_test_split approach does not work when I use TE.

test_size = 0.1
train.reset_index(inplace=True, drop=True)
test_index = random.sample(list(train.index), int(test_size*train.shape[0]))
train_ = train.iloc[list(set(train.index)-set(test_index))]
test = train.iloc[test_index]
display(train.shape, test.shape, train.head(3), test.head(3))

train0, test0 = train.copy(), test.copy()

(90245, 33)

(9024, 33)

Unnamed: 0,id,f_00,f_01,f_02,f_03,f_04,f_05,f_06,f_07,f_08,f_09,f_10,f_11,f_12,f_13,f_14,f_15,f_16,f_17,f_18,f_19,f_20,f_21,f_22,f_23,f_24,f_25,f_26,f_27,f_28,f_29,f_30,target
0,10,-0.552998,-0.340897,1.066008,0.820531,-1.651332,-0.325655,0.236907,2,2,2,2,2,1,3,1,2,0,1,3,1.791614,-0.075696,-2.238481,4.445503,-0.024057,2.940171,-1.730239,-1.741727,ADBBBBBSDC,-166.024405,0,1,1
1,62,-1.321287,-1.260372,0.579082,-1.816321,-0.114923,2.517207,-1.002934,1,2,5,4,2,4,1,0,2,3,2,4,-3.129782,-2.541379,-0.980057,1.539227,3.31967,-6.805451,-1.769838,-1.714182,BBADDBCJBE,-272.93109,0,0,1
2,67,0.504814,0.10184,0.761814,-1.830958,0.105315,-0.107404,-0.38741,4,7,3,1,2,3,3,2,1,2,3,2,-0.060139,1.687031,-5.556003,4.790562,-6.370943,0.382,5.253365,-1.861232,BABGCBBTAB,141.289169,0,1,1


Unnamed: 0,id,f_00,f_01,f_02,f_03,f_04,f_05,f_06,f_07,f_08,f_09,f_10,f_11,f_12,f_13,f_14,f_15,f_16,f_17,f_18,f_19,f_20,f_21,f_22,f_23,f_24,f_25,f_26,f_27,f_28,f_29,f_30,target
30693,306122,0.407043,-0.806163,-0.34728,0.901392,0.030624,1.021695,-1.538214,2,0,3,4,1,3,6,0,3,1,2,2,-1.253169,-0.398908,7.642284,-6.890171,4.232027,2.876797,-3.005297,1.626928,BCBCCBBAEA,131.539744,1,1,1
49647,496736,0.438078,-0.001046,0.993016,-1.157652,-0.832134,-1.768345,-2.262174,0,1,3,4,4,4,3,2,5,2,2,3,-0.353976,-0.519648,-1.963529,-0.418211,-1.148162,-7.600216,-0.368006,1.62161,ABBEAADHCF,-269.542892,0,2,1
16096,160441,-0.99414,0.021545,0.392897,0.792611,0.510427,0.231119,0.72925,3,1,1,1,2,1,2,2,6,1,2,1,-0.497649,2.944828,-4.778719,2.785395,-0.791527,-2.524437,1.767596,2.305497,ACAGCBFDEA,19.04157,0,2,1


In [5]:
# 5. FE #

# first do TE

train, test = train0.copy(), test0.copy()

display(train.head(), test.head())
train, test = TargetEncoderMP(train, test, ['f_27'], 'target')
display(train.head(), test.head())







# then do OHE with columntransformer


Unnamed: 0,id,f_00,f_01,f_02,f_03,f_04,f_05,f_06,f_07,f_08,f_09,f_10,f_11,f_12,f_13,f_14,f_15,f_16,f_17,f_18,f_19,f_20,f_21,f_22,f_23,f_24,f_25,f_26,f_27,f_28,f_29,f_30,target
0,10,-0.552998,-0.340897,1.066008,0.820531,-1.651332,-0.325655,0.236907,2,2,2,2,2,1,3,1,2,0,1,3,1.791614,-0.075696,-2.238481,4.445503,-0.024057,2.940171,-1.730239,-1.741727,ADBBBBBSDC,-166.024405,0,1,1
1,62,-1.321287,-1.260372,0.579082,-1.816321,-0.114923,2.517207,-1.002934,1,2,5,4,2,4,1,0,2,3,2,4,-3.129782,-2.541379,-0.980057,1.539227,3.31967,-6.805451,-1.769838,-1.714182,BBADDBCJBE,-272.93109,0,0,1
2,67,0.504814,0.10184,0.761814,-1.830958,0.105315,-0.107404,-0.38741,4,7,3,1,2,3,3,2,1,2,3,2,-0.060139,1.687031,-5.556003,4.790562,-6.370943,0.382,5.253365,-1.861232,BABGCBBTAB,141.289169,0,1,1
3,80,-0.237795,0.021497,-0.468444,-0.723512,-1.489679,1.20031,0.321046,1,5,0,3,2,5,3,3,1,2,5,2,-1.403882,2.448212,1.289593,0.986755,-6.309739,0.454259,2.914856,1.522981,ADAGBBCKAD,-55.387164,1,2,1
4,82,0.163361,0.593798,-0.499568,-0.550191,-1.192901,0.462731,0.569567,5,1,3,1,2,3,2,1,2,1,0,0,2.55729,-2.862385,-0.055949,-2.142073,-0.152629,3.043954,-2.634945,0.530955,ADBBAABCAG,210.230707,0,0,1


Unnamed: 0,id,f_00,f_01,f_02,f_03,f_04,f_05,f_06,f_07,f_08,f_09,f_10,f_11,f_12,f_13,f_14,f_15,f_16,f_17,f_18,f_19,f_20,f_21,f_22,f_23,f_24,f_25,f_26,f_27,f_28,f_29,f_30,target
30693,306122,0.407043,-0.806163,-0.34728,0.901392,0.030624,1.021695,-1.538214,2,0,3,4,1,3,6,0,3,1,2,2,-1.253169,-0.398908,7.642284,-6.890171,4.232027,2.876797,-3.005297,1.626928,BCBCCBBAEA,131.539744,1,1,1
49647,496736,0.438078,-0.001046,0.993016,-1.157652,-0.832134,-1.768345,-2.262174,0,1,3,4,4,4,3,2,5,2,2,3,-0.353976,-0.519648,-1.963529,-0.418211,-1.148162,-7.600216,-0.368006,1.62161,ABBEAADHCF,-269.542892,0,2,1
16096,160441,-0.99414,0.021545,0.392897,0.792611,0.510427,0.231119,0.72925,3,1,1,1,2,1,2,2,6,1,2,1,-0.497649,2.944828,-4.778719,2.785395,-0.791527,-2.524437,1.767596,2.305497,ACAGCBFDEA,19.04157,0,2,1
75090,750998,-0.453302,-1.203734,2.011548,0.310198,0.535364,-0.719162,-1.258001,1,0,7,2,3,1,3,4,1,2,1,1,-1.194457,3.470276,-2.770316,1.234959,2.744317,-1.42183,-0.795167,1.819991,ABBABAAGBG,71.883708,0,0,0
80069,800739,-0.277485,-1.413096,-2.251319,-1.185112,-0.802144,1.095336,-0.60666,1,0,3,4,2,4,2,2,2,0,2,0,1.857999,-2.379252,2.00849,1.426356,1.097634,1.462567,-0.305549,0.072295,ACBEBBBQDB,-2.147996,0,2,0


Unnamed: 0,id,f_00,f_01,f_02,f_03,f_04,f_05,f_06,f_07,f_08,f_09,f_10,f_11,f_12,f_13,f_14,f_15,f_16,f_17,f_18,f_19,f_20,f_21,f_22,f_23,f_24,f_25,f_26,f_28,f_29,f_30,target,f_27_encoded
0,10,-0.552998,-0.340897,1.066008,0.820531,-1.651332,-0.325655,0.236907,2,2,2,2,2,1,3,1,2,0,1,3,1.791614,-0.075696,-2.238481,4.445503,-0.024057,2.940171,-1.730239,-1.741727,-166.024405,0,1,1,0.484878
1,62,-1.321287,-1.260372,0.579082,-1.816321,-0.114923,2.517207,-1.002934,1,2,5,4,2,4,1,0,2,3,2,4,-3.129782,-2.541379,-0.980057,1.539227,3.31967,-6.805451,-1.769838,-1.714182,-272.93109,0,0,1,0.484878
2,67,0.504814,0.10184,0.761814,-1.830958,0.105315,-0.107404,-0.38741,4,7,3,1,2,3,3,2,1,2,3,2,-0.060139,1.687031,-5.556003,4.790562,-6.370943,0.382,5.253365,-1.861232,141.289169,0,1,1,0.484878
3,80,-0.237795,0.021497,-0.468444,-0.723512,-1.489679,1.20031,0.321046,1,5,0,3,2,5,3,3,1,2,5,2,-1.403882,2.448212,1.289593,0.986755,-6.309739,0.454259,2.914856,1.522981,-55.387164,1,2,1,0.484878
4,82,0.163361,0.593798,-0.499568,-0.550191,-1.192901,0.462731,0.569567,5,1,3,1,2,3,2,1,2,1,0,0,2.55729,-2.862385,-0.055949,-2.142073,-0.152629,3.043954,-2.634945,0.530955,210.230707,0,0,1,0.484878


Unnamed: 0,id,f_00,f_01,f_02,f_03,f_04,f_05,f_06,f_07,f_08,f_09,f_10,f_11,f_12,f_13,f_14,f_15,f_16,f_17,f_18,f_19,f_20,f_21,f_22,f_23,f_24,f_25,f_26,f_28,f_29,f_30,target,f_27_encoded
30693,306122,0.407043,-0.806163,-0.34728,0.901392,0.030624,1.021695,-1.538214,2,0,3,4,1,3,6,0,3,1,2,2,-1.253169,-0.398908,7.642284,-6.890171,4.232027,2.876797,-3.005297,1.626928,131.539744,1,1,1,0.548884
49647,496736,0.438078,-0.001046,0.993016,-1.157652,-0.832134,-1.768345,-2.262174,0,1,3,4,4,4,3,2,5,2,2,3,-0.353976,-0.519648,-1.963529,-0.418211,-1.148162,-7.600216,-0.368006,1.62161,-269.542892,0,2,1,0.548826
16096,160441,-0.99414,0.021545,0.392897,0.792611,0.510427,0.231119,0.72925,3,1,1,1,2,1,2,2,6,1,2,1,-0.497649,2.944828,-4.778719,2.785395,-0.791527,-2.524437,1.767596,2.305497,19.04157,0,2,1,0.548882
75090,750998,-0.453302,-1.203734,2.011548,0.310198,0.535364,-0.719162,-1.258001,1,0,7,2,3,1,3,4,1,2,1,1,-1.194457,3.470276,-2.770316,1.234959,2.744317,-1.42183,-0.795167,1.819991,71.883708,0,0,0,0.423859
80069,800739,-0.277485,-1.413096,-2.251319,-1.185112,-0.802144,1.095336,-0.60666,1,0,3,4,2,4,2,2,2,0,2,0,1.857999,-2.379252,2.00849,1.426356,1.097634,1.462567,-0.305549,0.072295,-2.147996,0,2,0,0.374857


In [6]:
# 6. fit XGBoost #

In [7]:
# 7. Do FI analysis