This notebook preprocesses the dataset in two steps:
1. scaling
2. imputation

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm

# general ML tooling
from sklearn.model_selection import train_test_split, StratifiedKFold, KFold
from sklearn.metrics import roc_auc_score
import wandb
from wandb.xgboost import wandb_callback
# import timm
from pathlib import Path
import os
import math
import seaborn as sns
from datetime import datetime

from catboost import CatBoostClassifier
from xgboost import XGBClassifier
# from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import MaxAbsScaler, StandardScaler, MinMaxScaler, PolynomialFeatures
from sklearn.feature_selection import SelectKBest, f_regression, mutual_info_regression
from joblib import dump, load
# feature engineering tools
# from sklearn.feature_selection import mutual_info_regression
# import featuretools as ft
import missingno as msno

from sklearn.impute import KNNImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

In [4]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Switch off the jedi-based tab completer in IPython.
%config Completer.use_jedi = False

# --- Disabled experiment-tracking setup (Weights & Biases) ---
# Everything below is commented out, so none of it runs in this notebook.
# It records the model/run settings of an earlier XGBoost experiment.
# os.environ['WANDB_NOTEBOOK_NAME'] = 'imputation_20210902.ipynb'
# config = {
#     # model config
#     "model":None,
#     "tree_method": "auto", # set to 'gpu_hist' to try GPU if available
#     "booster": 'dart', # dart may be marginally better, but will opt for this quicker approach as a default
#     "n_estimators": 400, 
#     "max_depth": 3,
#     "learning_rate": 0.1522,
#     "test_size": 0.2,
#     "scaler": MaxAbsScaler,
# #     "task_type": "GPU", # for CatBoost only
# #     "reg_alpha": 2.8,
# #     "reg_lambda": 3.987,
# #     "feature_selector": SelectKBest,
# #     "k_best": 80,
# #     "feature_selection_scoring": f_regression,
#     'random_state': 42,
#     'subsample': 1,
#     'n_jobs': -1,
#     'verbosity': 1,
#     'k_folds': 5,
#     'features_created': False,
#     'feature_creator': None,
# }

# config_run = {
#     # wandb config:
#     'name': f"{os.environ['WANDB_NOTEBOOK_NAME'][:-6]}_{datetime.now().strftime('%H%M%S')}", # just removes the .ipynb extension, leaving the notebook filename's stem
#     'tags': ['XGBoost', 'kfold', 'scaling'],
#     'notes': "A straight-up replication of previous best model -- a 400 estimator Dart-boosted one -- with k-fold ensembling. No feature generation or selection.",
# }

In [5]:
# Directory holding the competition data.
# The author's local mount remains the default, but the location can be
# redirected on another machine by exporting TPS_SEP2021_DATA_DIR, so the
# notebook no longer depends on this exact absolute path existing.
datapath = Path(
    os.environ.get(
        'TPS_SEP2021_DATA_DIR',
        '/media/sf/easystore/kaggle_data/tabular_playgrounds/sep2021/',
    )
)

In [4]:
# Disabled one-off conversion of train.csv (under `datapath`) into the Feather
# file 'dataset_df.feather'; presumably kept commented out so the CSV is not
# re-parsed on every run.
# NOTE(review): the index name is presumably cleared because Feather cannot
# store a named index -- confirm before re-enabling.
# df = pd.read_csv(datapath/'train.csv', index_col='id', low_memory=False)
# df.index.name = None
# df.to_feather(path='./dataset_df.feather')

In [6]:
# Read the raw, untransformed dataset back from its Feather file and
# label the row index as 'id'.
df = pd.read_feather('dataset_df.feather')
df.index = df.index.rename('id')

In [8]:
# Preview the first five rows of the raw frame.
df.head(n=5)

Unnamed: 0_level_0,f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,...,f110,f111,f112,f113,f114,f115,f116,f117,f118,claim
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.10859,0.004314,-37.566,0.017364,0.28915,-10.251,135.12,168900.0,399240000000000.0,86.489,...,-12.228,1.7482,1.9096,-7.1157,4378.8,1.2096,861340000000000.0,140.1,1.0177,1
1,0.1009,0.29961,11822.0,0.2765,0.4597,-0.83733,1721.9,119810.0,3874100000000000.0,9953.6,...,-56.758,4.1684,0.34808,4.142,913.23,1.2464,7575100000000000.0,1861.0,0.28359,0
2,0.17803,-0.00698,907.27,0.27214,0.45948,0.17327,2298.0,360650.0,12245000000000.0,15827.0,...,-5.7688,1.2042,0.2629,8.1312,45119.0,1.1764,321810000000000.0,3838.2,0.4069,1
3,0.15236,0.007259,780.1,0.025179,0.51947,7.4914,112.51,259490.0,77814000000000.0,-36.837,...,-34.858,2.0694,0.79631,-16.336,4952.4,1.1784,4533000000000.0,4889.1,0.51486,1
4,0.11623,0.5029,-109.15,0.29791,0.3449,-0.40932,2538.9,65332.0,1907200000000000.0,144.12,...,-13.641,1.5298,1.1464,-0.43124,3856.5,1.483,-8991300000000.0,,0.23049,1


In [9]:
# Target vector: the 'claim' column.
y = df['claim']

In [10]:
# Every column except the target is treated as a model feature.
features = [col for col in df.columns if not col == 'claim']
# Feature matrix restricted to those columns.
X = df.loc[:, features]

# Scaling

In [11]:
# Standardise the features: learn the per-column statistics from X,
# then apply them to the same matrix.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

# Imputation

## KNN Imputer

In [12]:
# k-nearest-neighbours imputer using 5 neighbours per missing value.
imputer = KNNImputer(
    n_neighbors=5,
)

In [16]:
# KNN imputation of the scaled feature matrix.
# The recorded run of this step took about 30 hours, so when an array of the
# expected shape is already saved as 'X_KNNImputed.joblib' it is reused
# instead of being recomputed.
# NOTE: the cache is keyed by file name only -- delete or rename the file after
# changing the scaler, the data, or the imputer settings.
# NOTE: joblib.load unpickles the file; only load artefacts you produced yourself.
knn_cache_path = Path('X_KNNImputed.joblib')

# fit() followed by transform() is what fit_transform() does; splitting the two
# leaves `imputer` fitted on X_scaled even when the cached result is reused.
# (For KNNImputer, fit is expected to be cheap relative to transform.)
imputer.fit(X_scaled)

X_imputed = load(knn_cache_path) if knn_cache_path.exists() else None
if X_imputed is None or X_imputed.shape != X_scaled.shape:
    # No usable cache on disk: run the expensive neighbour search.
    X_imputed = imputer.transform(X_scaled)

In [17]:
# Persist the imputed array with joblib so it does not have to be recomputed.
dump(value=X_imputed, filename='X_KNNImputed.joblib')

['X_KNNImputed.joblib']

**NOTE:** the KNN imputation above took about 30 hours with the default `n_neighbors=5`; it made 1820782 imputations across 598455 of the 957919 rows (~62%) on my desktop machine.

In [None]:
# Disabled shell rename that would tag the saved array with the scaler used
# (StandardScaler); uncomment to run it.
# !mv X_KNNImputed.joblib X_KNNImputed_StandardScaled.joblib

In [18]:
# Column labels of the feature frame; they are reused below to label the imputed array.
X.columns

Index(['f1', 'f2', 'f3', 'f4', 'f5', 'f6', 'f7', 'f8', 'f9', 'f10',
       ...
       'f109', 'f110', 'f111', 'f112', 'f113', 'f114', 'f115', 'f116', 'f117',
       'f118'],
      dtype='object', length=118)

In [20]:
# Wrap the imputed array in a DataFrame, restoring the feature names.
# No index is supplied, so the rows get a default 0..n-1 index.
X_StandardScaled_KNNimputed_5NN = pd.DataFrame(
    data=X_imputed,
    columns=X.columns,
)

In [21]:
# Preview the first five rows of the scaled and imputed features.
X_StandardScaled_KNNimputed_5NN.head(n=5)

Unnamed: 0,f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,...,f109,f110,f111,f112,f113,f114,f115,f116,f117,f118
0,0.422121,-2.336057,-0.640028,-0.865135,-0.108153,-4.793134,-1.164104,-0.602909,-0.602437,-0.520139,...,-0.96518,0.414373,-0.364292,-0.482119,-0.878642,-0.635844,0.006302,-0.622475,-1.210113,1.122438
1,0.245597,-0.316947,1.208458,0.35427,1.065282,-0.360618,0.07918,-0.745021,0.885596,0.459871,...,1.939498,-1.982493,2.337448,-0.516377,0.237215,-0.673335,0.326417,-0.522752,-0.664832,-0.674975
2,2.016108,-2.41328,-0.492762,0.333754,1.063769,0.115233,0.53057,-0.047807,-0.76816,1.043223,...,-0.662936,0.762045,-0.971575,-0.518246,0.632622,-0.195099,-0.282497,-0.630488,-0.038341,-0.37306
3,1.426856,-2.315919,-0.512583,-0.82836,1.476518,3.561044,-1.18182,-0.340658,-0.740081,-0.532388,...,-1.376995,-0.803706,-0.005727,-0.506544,-1.792552,-0.629638,-0.2651,-0.635201,0.294645,-0.108728
4,0.597496,1.073064,-0.651186,0.455018,0.275424,-0.159085,0.719322,-0.90273,0.043314,-0.514415,...,1.773967,0.338317,-0.608098,-0.498863,-0.216082,-0.641494,2.384546,-0.635402,0.477433,-0.804986


In [22]:
# Save the scaled and imputed feature frame in Feather format.
X_StandardScaled_KNNimputed_5NN.to_feather(
    './X_StandardScaled_KNNImputed_5NN.feather'
)

## IterativeImputer

In [None]:
# Second imputation strategy: scikit-learn's IterativeImputer with all default settings.
# Only the object is constructed in the visible part of the notebook; it is not fitted here.
iter_imputer = IterativeImputer()