Outline:
1. Load evth.
2. Preclean data.
3. Train-test split.
4. Missing values.
5. FE.
6. Modeling.
7. FI.
8. predictions.

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import os, time, warnings, gzip, gc, random, math, shap, pickle, optuna
from IPython.display import display
from matplotlib_venn import venn2, venn2_circles, venn2_unweighted
from matplotlib_venn import venn3, venn3_circles
from matplotlib import pyplot as plt
%matplotlib inline

from sklearn.preprocessing import LabelBinarizer, LabelEncoder, OrdinalEncoder, OneHotEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, train_test_split, KFold
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, precision_recall_curve, auc
from sklearn.metrics import mean_squared_error, mean_absolute_error, roc_auc_score
from sklearn.inspection import permutation_importance
from category_encoders import MEstimateEncoder
from xgboost import XGBClassifier

pd.set_option('display.max_columns', 5000)
pd.set_option('display.max_rows',200)

warnings.filterwarnings("ignore")

In [2]:
# target encoding code:

### target encoding ###
# source: https://www.kaggle.com/code/ryanholbrook/feature-engineering-for-house-prices/notebook

class CrossFoldEncoder:
    def __init__(self, encoder, **kwargs):
        self.encoder_ = encoder
        self.kwargs_ = kwargs  # keyword arguments for the encoder
        self.cv_ = KFold(n_splits=4)

    # Fit an encoder on one split and transform the feature on the
    # other. Iterating over the splits in all folds gives a complete
    # transformation. We also now have one trained encoder on each
    # fold.
    def fit_transform(self, X, y, cols):
        self.fitted_encoders_ = []
        self.cols_ = cols
        X_encoded = []
        for idx_encode, idx_train in self.cv_.split(X):
            fitted_encoder = self.encoder_(cols=cols, **self.kwargs_)
            fitted_encoder.fit(
                X.iloc[idx_encode, :], y.iloc[idx_encode],
            )
            X_encoded.append(fitted_encoder.transform(X.iloc[idx_train, :])[cols])
            self.fitted_encoders_.append(fitted_encoder)
        X_encoded = pd.concat(X_encoded)
        X_encoded.columns = [name + "_encoded" for name in X_encoded.columns]
        return X_encoded

    # To transform the test data, average the encodings learned from
    # each fold.
    def transform(self, X):
        from functools import reduce

        X_encoded_list = []
        for fitted_encoder in self.fitted_encoders_:
            X_encoded = fitted_encoder.transform(X)
            X_encoded_list.append(X_encoded[self.cols_])
        X_encoded = reduce(
            lambda x, y: x.add(y, fill_value=0), X_encoded_list
        ) / len(X_encoded_list)
        X_encoded.columns = [name + "_encoded" for name in X_encoded.columns]
        return X_encoded


In [3]:
# 1. Load data #

train = pd.read_csv('../input/tabular-playground-series-may-2022/train.csv')
display(train.shape, train.head())

(900000, 33)

Unnamed: 0,id,f_00,f_01,f_02,f_03,f_04,f_05,f_06,f_07,f_08,f_09,f_10,f_11,f_12,f_13,f_14,f_15,f_16,f_17,f_18,f_19,f_20,f_21,f_22,f_23,f_24,f_25,f_26,f_27,f_28,f_29,f_30,target
0,0,-1.373246,0.238887,-0.243376,0.567405,-0.647715,0.839326,0.113133,1,5,1,3,3,3,1,6,1,0,7,4,0.298218,-0.919717,3.058541,-2.540739,0.766952,-2.730628,-0.208177,1.363402,ABABDADBAB,67.609153,0,0,0
1,1,1.697021,-1.710322,-2.230332,-0.545661,1.113173,-1.552175,0.447825,1,3,4,0,2,3,0,1,0,4,6,0,-3.147667,-1.075434,2.17905,2.278315,-0.633658,-1.217077,-3.782194,-0.058316,ACACCADCEB,377.096415,0,0,1
2,2,1.681726,0.616746,-1.027689,0.810492,-0.609086,0.113965,-0.70866,1,0,2,6,6,4,3,1,2,2,1,4,2.820733,-3.485342,-0.784235,-1.385775,-0.520558,-0.009121,2.788536,-3.703488,AAAEABCKAD,-195.599702,0,2,1
3,3,-0.118172,-0.587835,-0.804638,2.086822,0.371005,-0.128831,-0.282575,3,2,1,0,1,6,4,2,3,3,0,3,1.081084,-2.100177,-2.343819,0.572594,-1.653213,1.686035,-2.533098,-0.608601,BDBBAACBCB,210.826205,0,0,1
4,4,1.148481,-0.176567,-0.664871,-1.101343,0.467875,0.500117,0.407515,3,3,0,4,3,0,6,0,3,3,1,0,-0.126179,0.605033,1.133665,-3.912929,-1.430366,2.127649,-3.306784,4.371371,BDBCBBCHFE,-217.211798,0,1,1


In [4]:
display(train.target.value_counts())
num_cols = [col for col in train.columns if train[col].nunique()>10]
num_cols.remove('f_27')
cat_cols = list(set(train.columns) - set(num_cols) - set('target'))
print('num_cols: ', num_cols, '\n', 'cat_cols: ', cat_cols)
display(train[num_cols].describe())
cat_cols = list(set(train.columns) - set(num_cols) - set('target'))
display([train[col].value_counts() for col in cat_cols])

0    462161
1    437839
Name: target, dtype: int64

num_cols:  ['id', 'f_00', 'f_01', 'f_02', 'f_03', 'f_04', 'f_05', 'f_06', 'f_07', 'f_08', 'f_09', 'f_10', 'f_11', 'f_12', 'f_13', 'f_14', 'f_15', 'f_16', 'f_17', 'f_18', 'f_19', 'f_20', 'f_21', 'f_22', 'f_23', 'f_24', 'f_25', 'f_26', 'f_28'] 
 cat_cols:  ['f_29', 'f_30', 'f_27', 'target']


Unnamed: 0,id,f_00,f_01,f_02,f_03,f_04,f_05,f_06,f_07,f_08,f_09,f_10,f_11,f_12,f_13,f_14,f_15,f_16,f_17,f_18,f_19,f_20,f_21,f_22,f_23,f_24,f_25,f_26,f_28
count,900000.0,900000.0,900000.0,900000.0,900000.0,900000.0,900000.0,900000.0,900000.0,900000.0,900000.0,900000.0,900000.0,900000.0,900000.0,900000.0,900000.0,900000.0,900000.0,900000.0,900000.0,900000.0,900000.0,900000.0,900000.0,900000.0,900000.0,900000.0,900000.0
mean,449999.5,-0.000286,0.001165,0.001174,-0.001368,-0.000571,0.000284,-0.000709,2.03146,2.057998,2.362431,2.177637,1.803392,2.842373,2.239778,1.514686,2.101132,2.096713,1.858518,2.065131,0.308713,-0.17873,-0.156307,-0.009273,-0.369459,-0.342738,0.176549,0.357591,-0.380876
std,259807.765473,0.998888,0.999193,1.000514,1.000175,1.000167,0.999875,0.999942,1.656172,1.590955,1.637706,1.645953,1.537487,1.762835,1.538426,1.359213,1.569093,1.560169,1.467675,1.564783,2.316026,2.400494,2.484706,2.450797,2.453405,2.386941,2.416959,2.47602,238.773054
min,0.0,-4.599856,-4.682199,-4.642676,-4.658816,-4.748501,-4.750214,-4.842919,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-11.280941,-11.257917,-13.310146,-11.85353,-12.301097,-11.416189,-11.918306,-14.300577,-1229.753052
25%,224999.75,-0.67549,-0.675162,-0.674369,-0.676114,-0.675909,-0.673437,-0.674876,1.0,1.0,1.0,1.0,1.0,2.0,1.0,0.0,1.0,1.0,1.0,1.0,-1.236061,-1.804612,-1.820063,-1.645585,-2.019739,-1.955956,-1.440424,-1.261598,-159.427418
50%,449999.5,0.001144,0.002014,0.002218,-0.002227,-0.001662,-0.000438,-0.001492,2.0,2.0,2.0,2.0,2.0,3.0,2.0,1.0,2.0,2.0,2.0,2.0,0.330249,-0.190571,-0.152668,0.03085,-0.390966,-0.340746,0.160912,0.404212,-0.519808
75%,674999.25,0.674337,0.675021,0.677505,0.672544,0.673789,0.675028,0.674749,3.0,3.0,3.0,3.0,3.0,4.0,3.0,2.0,3.0,3.0,3.0,3.0,1.880517,1.444508,1.507071,1.661676,1.255408,1.266673,1.795928,2.028219,158.987357
max,899999.0,4.749301,4.815699,4.961982,4.45492,4.948983,4.971881,4.822668,15.0,16.0,14.0,14.0,13.0,16.0,12.0,14.0,14.0,15.0,14.0,13.0,12.079667,11.475325,14.455426,11.34408,12.2471,12.389844,12.529179,12.913041,1229.562577


[0    588905
 1    311095
 Name: f_29, dtype: int64,
 2    303032
 0    300643
 1    296325
 Name: f_30, dtype: int64,
 BBBBBBCJBC    12
 BCBBBBCLBC    12
 BBBBBBDPCB    10
 BBBBBBDKBC    10
 ADBBBACQBC    10
               ..
 BFAFAADTCE     1
 BBBDAACOAE     1
 BBABAACGBB     1
 BFAABAGSDB     1
 BCAACADSCE     1
 Name: f_27, Length: 741354, dtype: int64,
 0    462161
 1    437839
 Name: target, dtype: int64]

In [5]:
#[train[col].value_counts() for col in train.columns if train[col].nunique()<10]
display(train.count())

id        900000
f_00      900000
f_01      900000
f_02      900000
f_03      900000
f_04      900000
f_05      900000
f_06      900000
f_07      900000
f_08      900000
f_09      900000
f_10      900000
f_11      900000
f_12      900000
f_13      900000
f_14      900000
f_15      900000
f_16      900000
f_17      900000
f_18      900000
f_19      900000
f_20      900000
f_21      900000
f_22      900000
f_23      900000
f_24      900000
f_25      900000
f_26      900000
f_27      900000
f_28      900000
f_29      900000
f_30      900000
target    900000
dtype: int64

In [6]:
# 3. split data #

train_x = train.copy()
train_y = train_x.pop('target')

display(train_x.shape, train_y.shape)

X_train, X_test, y_train, y_test = train_test_split(train_x, train_y, test_size=0.05, random_state=1)
display(X_train.shape, X_test.shape, y_train.shape)

(900000, 32)

(900000,)

(855000, 32)

(45000, 32)

(855000,)

In [7]:
# 5. FE #

# first do TE

# then do OHE with columntransformer


In [8]:
# 6. fit XGBoost #

In [9]:
# 7. Do FI analysis