# 20211108 Feature Reduction, Concatenation
After weak results on the RobustScaler ==> PCA (MLE) ==> UMAP preprocessing pipeline, I'm going to try something a bit different: concatenate that reduced dataset onto the (RobustScaled) original dataset.

In [1]:
# notebook configuration
COLAB = False # will trigger manual installation of packages
USE_GPU = True 

In [2]:
# basic imports
from pathlib import Path
import os
import math
from datetime import datetime
import random

In [3]:
%matplotlib inline
%config Completer.use_jedi = False
os.environ['WANDB_NOTEBOOK_NAME'] = f"nb_{datetime.now().strftime('%Y%m%d')}.ipynb"

Now, non-stdlib imports

In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm

# general ML tooling
from sklearn.model_selection import train_test_split, StratifiedKFold, KFold
from sklearn.metrics import log_loss, roc_auc_score
import wandb
from wandb.xgboost import wandb_callback
from wandb.lightgbm import wandb_callback
from sklearn.impute import SimpleImputer #, KNNImputer

import seaborn as sns

from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
# from sklearn.ensemble import StackingClassifier, RandomForestClassifier
from sklearn.preprocessing import RobustScaler #StandardScaler #, MinMaxScaler, MaxAbsScaler, RobustScaler, PolynomialFeatures
from sklearn.linear_model import LogisticRegression
# from sklearn.feature_selection import SelectKBest, f_regression
from joblib import dump, load
# feature engineering tools
# from sklearn.feature_selection import mutual_info_regression
# import featuretools as ft

from BorutaShap import BorutaShap

In [5]:
from pytorch_widedeep import Trainer
from pytorch_widedeep.preprocessing import WidePreprocessor, TabPreprocessor
from pytorch_widedeep.models import Wide, TabMlp, WideDeep#, SAINT, TabTransformer, TabNet, TabFastFormer, TabResnet
from pytorch_widedeep.metrics import Accuracy
from torchmetrics import AUROC
import torch
from torch.optim import Adam, AdamW, Adagrad, SGD, RMSprop, LBFGS
from torch.optim.lr_scheduler import ReduceLROnPlateau, CosineAnnealingWarmRestarts, CyclicLR, OneCycleLR, StepLR, CosineAnnealingLR
from pytorch_widedeep.callbacks import EarlyStopping, LRHistory, ModelCheckpoint

In [6]:
# import category_encoders as ce

`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.


Now, datapath setup

In [7]:
if COLAB:
    # mount Google Drive
    from google.colab import drive
    drive.mount('/content/drive')
    
    # handling datapath
    datapath = Path('/content/drive/MyDrive/kaggle/tabular_playgrounds/nov2021/')
    
else:
    # if on local machine
#     datapath = Path('/media/sf/easystore/kaggle_data/tabular_playgrounds/sep2021/')  
    root = Path('/media/sf/easystore/kaggle_data/tabular_playgrounds/nov2021/')
    datapath = root/'datasets'
    # edapath = root/'EDA'
    # modelpath = Path('/media/sf/easystore/kaggle_data/tabular_playgrounds/oct2021/models/')
    predpath = root/'preds'
    subpath = root/'submissions'
    
    for pth in [datapath, predpath, subpath]:
        pth.mkdir(exist_ok=True)
    


In [8]:
SEED = 42

# Function to seed everything
def seed_everything(seed):
    random.seed(seed)
    np.random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)

seed_everything(seed=SEED)

# Generating New Dataset

In [9]:
# load the parent dataframes
manifold_source = datapath/'X-RobustScaled-pca_mle-umap_embedding_20211107-n_comp10-n_neighbors15-rs42.joblib'
original_source = datapath/'X_orig.feather'

manifold_df = pd.DataFrame(load(manifold_source))
original_df = pd.read_feather(original_source)

In [10]:
manifold_df.head()

`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.


Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,2.05373,-1.756666,11.37886,6.936342,6.235762,8.824183,9.119911,3.103356,9.724831,3.719539
1,1.554933,-0.268288,7.588934,7.686738,6.930284,8.36887,7.913663,0.505868,8.038529,3.076725
2,0.963801,-0.15052,10.355945,7.051244,6.157423,10.085479,8.593321,2.286653,10.166838,3.171345
3,1.592505,0.211458,13.213996,5.814211,6.522682,9.577888,9.720708,0.139408,10.984689,0.582962
4,1.18672,0.064047,10.237234,7.400344,6.29491,9.111214,8.108749,1.990173,10.146399,2.383465


In [12]:
manifold_df.columns = [f'm{col}' for col in range(10)]

`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.


In [13]:
manifold_df.head()

Unnamed: 0,m0,m1,m2,m3,m4,m5,m6,m7,m8,m9
0,2.05373,-1.756666,11.37886,6.936342,6.235762,8.824183,9.119911,3.103356,9.724831,3.719539
1,1.554933,-0.268288,7.588934,7.686738,6.930284,8.36887,7.913663,0.505868,8.038529,3.076725
2,0.963801,-0.15052,10.355945,7.051244,6.157423,10.085479,8.593321,2.286653,10.166838,3.171345
3,1.592505,0.211458,13.213996,5.814211,6.522682,9.577888,9.720708,0.139408,10.984689,0.582962
4,1.18672,0.064047,10.237234,7.400344,6.29491,9.111214,8.108749,1.990173,10.146399,2.383465


In [11]:
original_df.head()

`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.


Unnamed: 0,f0,f1,f2,f3,f4,f5,f6,f7,f8,f9,...,f90,f91,f92,f93,f94,f95,f96,f97,f98,f99
0,0.106643,3.59437,132.804,3.18428,0.081971,1.18859,3.73238,2.26627,2.09959,0.01233,...,0.010739,1.09862,0.013331,-0.011715,0.052759,0.0654,4.21125,1.97877,0.085974,0.240496
1,0.125021,1.67336,76.5336,3.37825,0.0994,5.09366,1.27562,-0.471318,4.54594,0.037706,...,0.135838,3.46017,0.017054,0.124863,0.154064,0.606848,-0.267928,2.57786,-0.020877,0.024719
2,0.03633,1.49747,233.546,2.19435,0.026914,3.12694,5.05687,3.84946,1.80187,0.056995,...,0.11731,4.883,0.085222,0.032396,0.116092,-0.001688,-0.520069,2.14112,0.124464,0.148209
3,-0.014077,0.246,779.967,1.89064,0.006948,1.53112,2.698,4.51733,4.50332,0.123494,...,-0.015347,3.47439,-0.017103,-0.0081,0.062013,0.041193,0.511657,1.9686,0.040017,0.044873
4,-0.003259,3.71542,156.128,2.14772,0.018284,2.09859,4.15492,-0.038236,3.37145,0.034166,...,0.013781,1.91059,-0.042943,0.105616,0.125072,0.037509,1.04379,1.07481,-0.012819,0.072798


In [23]:
scaler = RobustScaler()
original_df = scaler.fit_transform(original_df)

`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.


In [26]:
original_df = pd.DataFrame(original_df, columns=[f'f{col}' for col in range(100)])

In [27]:
original_df.head()

Unnamed: 0,f0,f1,f2,f3,f4,f5,f6,f7,f8,f9,...,f90,f91,f92,f93,f94,f95,f96,f97,f98,f99
0,0.023869,0.414343,-0.003178,0.223129,0.219181,-0.549174,0.356604,-0.117173,-0.149785,-0.569723,...,-0.615937,-0.519509,-0.523,-1.07358,-0.111175,0.042435,0.625515,-0.282294,0.287257,2.097848
1,0.073411,-0.324111,-0.220699,0.301799,0.406584,0.980651,-0.58429,-1.216782,0.824004,-0.258296,...,1.142983,0.432846,-0.487967,1.07122,0.943432,7.115126,-1.115475,-0.041835,-0.812236,-0.388992
2,-0.165673,-0.391725,0.386256,-0.178365,-0.372808,0.210182,0.863859,0.518747,-0.268295,-0.021567,...,0.882475,1.006638,0.153544,-0.380862,0.548134,-0.83391,-1.213478,-0.217131,0.683318,1.034235
3,-0.301555,-0.872802,2.498527,-0.301544,-0.587486,-0.414987,-0.039545,0.787011,0.807038,0.794548,...,-0.982719,0.43858,-0.809415,-1.016802,-0.014837,-0.273769,-0.812462,-0.286376,-0.185636,-0.156724
4,-0.272393,0.460876,0.086985,-0.197278,-0.465605,-0.192678,0.518429,-1.042825,0.356489,-0.30174,...,-0.573168,-0.192062,-1.052588,0.768968,0.641618,-0.321887,-0.60563,-0.64512,-0.729316,0.165118


In [28]:
hybrid_df = original_df.join(manifold_df)

`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.


In [29]:
hybrid_df.head()

Unnamed: 0,f0,f1,f2,f3,f4,f5,f6,f7,f8,f9,...,m0,m1,m2,m3,m4,m5,m6,m7,m8,m9
0,0.023869,0.414343,-0.003178,0.223129,0.219181,-0.549174,0.356604,-0.117173,-0.149785,-0.569723,...,2.05373,-1.756666,11.37886,6.936342,6.235762,8.824183,9.119911,3.103356,9.724831,3.719539
1,0.073411,-0.324111,-0.220699,0.301799,0.406584,0.980651,-0.58429,-1.216782,0.824004,-0.258296,...,1.554933,-0.268288,7.588934,7.686738,6.930284,8.36887,7.913663,0.505868,8.038529,3.076725
2,-0.165673,-0.391725,0.386256,-0.178365,-0.372808,0.210182,0.863859,0.518747,-0.268295,-0.021567,...,0.963801,-0.15052,10.355945,7.051244,6.157423,10.085479,8.593321,2.286653,10.166838,3.171345
3,-0.301555,-0.872802,2.498527,-0.301544,-0.587486,-0.414987,-0.039545,0.787011,0.807038,0.794548,...,1.592505,0.211458,13.213996,5.814211,6.522682,9.577888,9.720708,0.139408,10.984689,0.582962
4,-0.272393,0.460876,0.086985,-0.197278,-0.465605,-0.192678,0.518429,-1.042825,0.356489,-0.30174,...,1.18672,0.064047,10.237234,7.400344,6.29491,9.111214,8.108749,1.990173,10.146399,2.383465


In [30]:
hybrid_df.to_feather(datapath/'X_hybrid_20211108.feather')

`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.


In [31]:
hybrid_params = {
    'train_source': (str(datapath/'X_hybrid_20211108.feather'), [str(original_source), str(manifold_source)]), 
    'target_source': str(datapath/'y_orig.joblib'),
    'test_source': str(datapath/'X_test-RobustScaled-pca_mle-umap_embedding_20211107-n_comp10-n_neighbors15-rs42.joblib'),
    'scaler': [str(RobustScaler()), str(RobustScaler())],
    'pca': [None, str(load(datapath/'pca_mle-RobustScaled_orig_trainset.joblib'))],
    'umap': [None, str(load(datapath/'umap_reducer-20211107-n_comp10-n_neighbors15-rs42-pca_mle-RobustScaled_orig_trainset.joblib'))],
    'dataset_description': "Hybrid of original dataset concatenated with PCA (MLE) ==> UMAP (10 cmpts, 15 neighbors); both RobustScaled from the start",
}   

Tensorflow not installed; ParametricUMAP will be unavailable


In [32]:
dump(hybrid_params, datapath/'X_hybrid_20211108_meta.joblib')

`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.


['/media/sf/easystore/kaggle_data/tabular_playgrounds/nov2021/datasets/X_hybrid_20211108_meta.joblib']

## Test Set

In [17]:
manifold_test_source = datapath/'X_test-RobustScaled-pca_mle-umap_embedding_20211107-n_comp10-n_neighbors15-rs42.joblib'
original_test_source = datapath/'test.feather'

manifold_test_df = load(manifold_test_source)

In [18]:
type(manifold_test_df)

`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.


pandas.core.frame.DataFrame

In [19]:
original_test_df = pd.read_feather(original_test_source)
original_test_df.head()

Unnamed: 0,id,f0,f1,f2,f3,f4,f5,f6,f7,f8,...,f90,f91,f92,f93,f94,f95,f96,f97,f98,f99
0,600000,0.003229,4.83866,585.529,2.28291,0.71318,3.90783,0.480696,1.48227,4.89181,...,0.11291,1.07355,0.122149,0.08633,0.03601,0.010619,0.290343,1.8982,0.131533,0.012047
1,600001,0.008602,0.505536,-100.099,3.01267,0.027199,1.19461,5.03662,2.51744,4.55389,...,-0.020214,2.62234,0.123307,0.033063,0.123059,0.005771,-0.392923,3.68964,0.047418,0.120015
2,600002,1.461,2.43726,-112.964,3.54123,0.752338,4.33831,1.64808,4.69991,1.95025,...,-0.011036,2.03018,-0.000426,0.084091,0.123605,0.499554,4.05465,3.33067,0.108843,0.064687
3,600003,0.140556,3.08561,179.451,0.573945,0.057342,2.21679,1.62348,0.526174,1.54254,...,0.050117,0.221613,0.045298,0.129966,0.004015,0.018279,2.69658,-0.533491,0.052524,0.011058
4,600004,0.128876,5.19976,107.466,-0.497149,0.08022,0.458121,0.629839,5.24046,-0.232279,...,0.05886,2.66043,0.135425,0.036481,0.093912,0.056315,1.11071,3.58447,0.145319,-0.050393


In [20]:
original_test_df = original_test_df.drop('id', axis=1)

`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.


In [22]:
original_test_df.head()

Unnamed: 0,f0,f1,f2,f3,f4,f5,f6,f7,f8,f9,...,f90,f91,f92,f93,f94,f95,f96,f97,f98,f99
0,0.003229,4.83866,585.529,2.28291,0.71318,3.90783,0.480696,1.48227,4.89181,0.056351,...,0.11291,1.07355,0.122149,0.08633,0.03601,0.010619,0.290343,1.8982,0.131533,0.012047
1,0.008602,0.505536,-100.099,3.01267,0.027199,1.19461,5.03662,2.51744,4.55389,0.063876,...,-0.020214,2.62234,0.123307,0.033063,0.123059,0.005771,-0.392923,3.68964,0.047418,0.120015
2,1.461,2.43726,-112.964,3.54123,0.752338,4.33831,1.64808,4.69991,1.95025,0.005303,...,-0.011036,2.03018,-0.000426,0.084091,0.123605,0.499554,4.05465,3.33067,0.108843,0.064687
3,0.140556,3.08561,179.451,0.573945,0.057342,2.21679,1.62348,0.526174,1.54254,-0.02616,...,0.050117,0.221613,0.045298,0.129966,0.004015,0.018279,2.69658,-0.533491,0.052524,0.011058
4,0.128876,5.19976,107.466,-0.497149,0.08022,0.458121,0.629839,5.24046,-0.232279,0.030006,...,0.05886,2.66043,0.135425,0.036481,0.093912,0.056315,1.11071,3.58447,0.145319,-0.050393


In [33]:
original_test_df.to_feather(datapath/'X_test_orig-no_scaling.feather')

In [34]:
scaled_test = scaler.transform(original_test_df)
scaled_test_df = pd.DataFrame(scaled_test, columns=original_test_df.columns)
scaled_test_df.to_feather(datapath/'X_test_orig-RobustScaled.feather')


In [35]:
hybrid_df.head()

`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.


Unnamed: 0,f0,f1,f2,f3,f4,f5,f6,f7,f8,f9,...,m0,m1,m2,m3,m4,m5,m6,m7,m8,m9
0,0.023869,0.414343,-0.003178,0.223129,0.219181,-0.549174,0.356604,-0.117173,-0.149785,-0.569723,...,2.05373,-1.756666,11.37886,6.936342,6.235762,8.824183,9.119911,3.103356,9.724831,3.719539
1,0.073411,-0.324111,-0.220699,0.301799,0.406584,0.980651,-0.58429,-1.216782,0.824004,-0.258296,...,1.554933,-0.268288,7.588934,7.686738,6.930284,8.36887,7.913663,0.505868,8.038529,3.076725
2,-0.165673,-0.391725,0.386256,-0.178365,-0.372808,0.210182,0.863859,0.518747,-0.268295,-0.021567,...,0.963801,-0.15052,10.355945,7.051244,6.157423,10.085479,8.593321,2.286653,10.166838,3.171345
3,-0.301555,-0.872802,2.498527,-0.301544,-0.587486,-0.414987,-0.039545,0.787011,0.807038,0.794548,...,1.592505,0.211458,13.213996,5.814211,6.522682,9.577888,9.720708,0.139408,10.984689,0.582962
4,-0.272393,0.460876,0.086985,-0.197278,-0.465605,-0.192678,0.518429,-1.042825,0.356489,-0.30174,...,1.18672,0.064047,10.237234,7.400344,6.29491,9.111214,8.108749,1.990173,10.146399,2.383465


In [37]:
manifold_test_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,1.805481,0.382892,8.975375,6.223257,6.345812,8.811499,8.968513,2.358689,10.336686,1.723404
1,1.815467,1.81789,7.03835,7.159169,5.196725,10.220221,8.561192,0.262517,9.995115,1.153083
2,1.864622,0.302617,9.193873,6.061775,6.450215,8.866345,9.312219,2.476474,10.220789,1.538796
3,1.284698,-0.488024,10.264484,7.524199,6.294941,9.43299,8.738018,2.073075,9.901546,3.294871
4,1.239167,0.512479,10.570358,5.112319,6.427414,9.153562,8.228388,2.302179,8.561516,1.759488


In [38]:
manifold_test_df.columns = [f'm{col}' for col in range(10)]
manifold_test_df.head() 

`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.


Unnamed: 0,m0,m1,m2,m3,m4,m5,m6,m7,m8,m9
0,1.805481,0.382892,8.975375,6.223257,6.345812,8.811499,8.968513,2.358689,10.336686,1.723404
1,1.815467,1.81789,7.03835,7.159169,5.196725,10.220221,8.561192,0.262517,9.995115,1.153083
2,1.864622,0.302617,9.193873,6.061775,6.450215,8.866345,9.312219,2.476474,10.220789,1.538796
3,1.284698,-0.488024,10.264484,7.524199,6.294941,9.43299,8.738018,2.073075,9.901546,3.294871
4,1.239167,0.512479,10.570358,5.112319,6.427414,9.153562,8.228388,2.302179,8.561516,1.759488


In [39]:
manifold_test_df.shape

`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.


(540000, 10)

In [40]:
hybrid_df.shape

(600000, 110)

In [41]:
scaled_test_df.shape

(540000, 100)

In [42]:
scaled_test_df.head()

Unnamed: 0,f0,f1,f2,f3,f4,f5,f6,f7,f8,f9,...,f90,f91,f92,f93,f94,f95,f96,f97,f98,f99
0,-0.254904,0.89266,1.746898,-0.142447,7.006148,0.516098,-0.888731,-0.432083,0.96168,-0.029473,...,0.82061,-0.52962,0.501055,0.466111,-0.285537,-0.673139,-0.898483,-0.314633,0.756057,-0.53504
1,-0.24042,-0.773034,-0.903499,0.153528,-0.36974,-0.546816,0.856103,-0.016286,0.827168,0.062887,...,-1.051149,0.094969,0.511953,-0.370394,0.620662,-0.73647,-1.164058,0.404404,-0.109484,0.709297
2,3.674804,-0.030461,-0.95323,0.3679,7.427188,0.68474,-0.441645,0.860348,-0.209231,-0.655958,...,-0.922098,-0.143834,-0.652471,0.430951,0.626346,5.713593,0.564647,0.260323,0.522578,0.071635
3,0.115288,0.218771,0.177143,-0.835567,-0.045634,-0.146373,-0.451066,-0.816119,-0.371523,-1.042097,...,-0.062278,-0.873185,-0.222166,1.151356,-0.618611,-0.573085,0.036785,-1.29065,-0.056943,-0.546438
4,0.083803,1.031471,-0.101125,-1.26998,0.200357,-0.835338,-0.831612,1.077471,-1.078004,-0.352785,...,0.060654,0.11033,0.625992,-0.316714,0.317232,-0.076241,-0.579619,0.362192,0.897914,-1.254661


In [43]:
hybrid_test_df = scaled_test_df.join(manifold_test_df)
hybrid_test_df.head()

`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.


Unnamed: 0,f0,f1,f2,f3,f4,f5,f6,f7,f8,f9,...,m0,m1,m2,m3,m4,m5,m6,m7,m8,m9
0,-0.254904,0.89266,1.746898,-0.142447,7.006148,0.516098,-0.888731,-0.432083,0.96168,-0.029473,...,1.805481,0.382892,8.975375,6.223257,6.345812,8.811499,8.968513,2.358689,10.336686,1.723404
1,-0.24042,-0.773034,-0.903499,0.153528,-0.36974,-0.546816,0.856103,-0.016286,0.827168,0.062887,...,1.815467,1.81789,7.03835,7.159169,5.196725,10.220221,8.561192,0.262517,9.995115,1.153083
2,3.674804,-0.030461,-0.95323,0.3679,7.427188,0.68474,-0.441645,0.860348,-0.209231,-0.655958,...,1.864622,0.302617,9.193873,6.061775,6.450215,8.866345,9.312219,2.476474,10.220789,1.538796
3,0.115288,0.218771,0.177143,-0.835567,-0.045634,-0.146373,-0.451066,-0.816119,-0.371523,-1.042097,...,1.284698,-0.488024,10.264484,7.524199,6.294941,9.43299,8.738018,2.073075,9.901546,3.294871
4,0.083803,1.031471,-0.101125,-1.26998,0.200357,-0.835338,-0.831612,1.077471,-1.078004,-0.352785,...,1.239167,0.512479,10.570358,5.112319,6.427414,9.153562,8.228388,2.302179,8.561516,1.759488


In [44]:
hybrid_test_df.to_feather(datapath/'X_test_hybrid_20211108.feather')

`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.


In [45]:
hybrid_params = {
    'train_source': (str(datapath/'X_hybrid_20211108.feather'), [str(original_source), str(manifold_source)]), 
    'target_source': str(datapath/'y_orig.joblib'),
    'test_source': (str(datapath/'X_test_hybrid_20211108.feather'), [str(datapath/'X_test_orig-RobustScaled.feather'), str(datapath/'X_test-RobustScaled-pca_mle-umap_embedding_20211107-n_comp10-n_neighbors15-rs42.joblib')]),
    'scaler': [str(RobustScaler()), str(RobustScaler())],
    'pca': [None, str(load(datapath/'pca_mle-RobustScaled_orig_trainset.joblib'))],
    'umap': [None, str(load(datapath/'umap_reducer-20211107-n_comp10-n_neighbors15-rs42-pca_mle-RobustScaled_orig_trainset.joblib'))],
    'dataset_description': "Hybrid of original dataset concatenated with PCA (MLE) ==> UMAP (10 cmpts, 15 neighbors); both RobustScaled from the start",
}   

In [46]:
dump(hybrid_params, datapath/'X_hybrid_20211108_meta.joblib')

`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.


['/media/sf/easystore/kaggle_data/tabular_playgrounds/nov2021/datasets/X_hybrid_20211108_meta.joblib']

In [48]:
check_params = load(datapath/'X_hybrid_20211108_meta.joblib')
check_params

{'train_source': ('/media/sf/easystore/kaggle_data/tabular_playgrounds/nov2021/datasets/X_hybrid_20211108.feather',
  ['/media/sf/easystore/kaggle_data/tabular_playgrounds/nov2021/datasets/X_orig.feather',
   '/media/sf/easystore/kaggle_data/tabular_playgrounds/nov2021/datasets/X-RobustScaled-pca_mle-umap_embedding_20211107-n_comp10-n_neighbors15-rs42.joblib']),
 'target_source': '/media/sf/easystore/kaggle_data/tabular_playgrounds/nov2021/datasets/y_orig.joblib',
 'test_source': ('/media/sf/easystore/kaggle_data/tabular_playgrounds/nov2021/datasets/X_test_hybrid_20211108.feather',
  ['/media/sf/easystore/kaggle_data/tabular_playgrounds/nov2021/datasets/X_test_orig-RobustScaled.feather',
   '/media/sf/easystore/kaggle_data/tabular_playgrounds/nov2021/datasets/X_test-RobustScaled-pca_mle-umap_embedding_20211107-n_comp10-n_neighbors15-rs42.joblib']),
 'scaler': ['RobustScaler()', 'RobustScaler()'],
 'pca': [None, "PCA(n_components='mle', random_state=42)"],
 'umap': [None,
  "UMAP(n_comp

In [10]:
# # dataset_params will initially include either trivial class instances or loaded, precomputed artifacts
# dataset_params = {
#     'train_source': str(datapath/'X_hybrid_20211108.feather'),
#     'target_source': str(datapath/'y_orig.joblib'),
#     'test_source': str(datapath/'X_test-RobustScaled-pca_mle-umap_embedding_20211107-n_comp10-n_neighbors15-rs42.joblib'),
#     'scaler': str(RobustScaler()),
#     'pca': str(load(datapath/'pca_mle-RobustScaled_orig_trainset.joblib')),
#     'umap': str(load(datapath/'umap_reducer-20211107-n_comp10-n_neighbors15-rs42-pca_mle-RobustScaled_orig_trainset.joblib')),
# }   

# # referring back to the already-entered attributes, specify how the pipeline was sequenced
# # dataset_params['preprocessing_pipeline'] = str([dataset_params['scaler'], dataset_params['pca'], dataset_params['umap']]) # ACTUALLY this is unwieldy
# dataset_params['preprocessing_pipeline'] = '[scaler, pca, umap]' # more fragile, but also more readable

# # now, load the datasets and generate more metadata from them
# X = load(dataset_params['train_source'])
# y = load(dataset_params['target_source'])
# X_test = load(dataset_params['test_source'])

# dataset_params['feature_count'] = X.shape[1]
# dataset_params['instance_count'] = X.shape[0]
    

## Ex-Model Config

In [11]:
# meta-config for preprocessing and cross-validation, but NOT for model parameters
exmodel_config = {
#     "feature_selector": SelectKBest,
#     "k_best": 80,
#     "feature_selection_scoring": f_regression,
#     'random_state': SEED,
#     'feature_generation': ['NaN_counts', 'SummaryStats', 'NaN_OneHots'],
#     'subsample': 1,
    'cross_val_strategy': KFold, # None for holdout, or the relevant sklearn class
    'kfolds': 5, # if 1, that means just doing holdout
    'test_size': 0.2,
    **dataset_params
#     'features_created': False,
#     'feature_creator': None,
}

# Preprocessing Pipeline

Let's try something like the pipeline cited as "typical" on the [UMAP FAQ page](https://umap-learn.readthedocs.io/en/latest/faq.html):

> Consider a typical pipeline: high-dimensional embedding (300+) => PCA to reduce to 50 dimensions => UMAP to reduce to 10-20 dimensions => HDBSCAN for clustering / some plain algorithm for classification;



## 1 PCA

I'll skip the "high-dimensional embedding" part insofar as I don't have any categorical variables. And I'll let `mle` determine the number of dimensions.

In [16]:
from sklearn.decomposition import PCA

In [18]:
pca = PCA(n_components='mle', random_state=42)

`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.


In [19]:
X_pca = pca.fit_transform(X)
dump(pca, datapath/'pca_mle-RobustScaled_orig_trainset.joblib')

`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.


['/media/sf/easystore/kaggle_data/tabular_playgrounds/nov2021/datasets/pca_mle-RobustScaled_orig_trainset.joblib']

In [20]:
X_pca_df = pd.DataFrame(X_pca, index=X.index)
X_pca_df.head()

`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,89,90,91,92,93,94,95,96,97,98
0,-0.712092,-3.039303,-1.987172,-0.578403,-1.370641,11.474382,-2.379167,-0.937235,-2.594491,-2.151906,...,-0.179098,0.271471,0.298934,-0.224963,-0.132194,-0.284666,-0.558664,-0.252547,-0.44024,0.479289
1,-1.495624,-0.795305,-0.5587,0.389478,-1.006746,-1.348991,-1.0434,-1.305068,-0.7612,-1.488306,...,-0.031016,-0.135605,0.196139,0.388194,0.199077,-0.085138,-0.392932,0.342501,-0.295981,-0.95333
2,-0.92811,-2.377723,-1.526275,-0.318966,-1.614533,-2.112136,-1.716707,3.21808,-0.495027,-0.7477,...,0.100588,0.3525,-0.485187,-0.541339,5.1e-05,-0.03505,0.944401,-0.025735,0.098342,0.308155
3,-0.964916,-1.004537,-2.112259,-0.417141,-1.870255,-1.997467,-1.606478,-1.895262,0.203275,-1.368667,...,0.508566,0.954413,0.40627,0.462892,0.09234,0.261099,1.077867,0.381276,0.527398,0.233128
4,-0.440551,1.783703,-1.128116,0.15153,-1.305119,-1.391343,-0.766827,-0.897287,-0.753682,-1.99399,...,-0.154709,-0.707765,-0.268428,-0.617855,0.337783,0.17624,0.828138,-0.663256,-0.886622,-0.314749


In [22]:
X.head()

Unnamed: 0,f0,f1,f2,f3,f4,f5,f6,f7,f8,f9,...,f90,f91,f92,f93,f94,f95,f96,f97,f98,f99
0,0.023869,0.414343,-0.003178,0.223129,0.219181,-0.549174,0.356604,-0.117173,-0.149785,-0.569723,...,-0.615937,-0.519509,-0.523,-1.07358,-0.111175,0.042435,0.625515,-0.282294,0.287257,2.097848
1,0.073411,-0.324111,-0.220699,0.301799,0.406584,0.980651,-0.58429,-1.216782,0.824004,-0.258296,...,1.142983,0.432846,-0.487967,1.07122,0.943432,7.115126,-1.115475,-0.041835,-0.812236,-0.388992
2,-0.165673,-0.391725,0.386256,-0.178365,-0.372808,0.210182,0.863859,0.518747,-0.268295,-0.021567,...,0.882475,1.006638,0.153544,-0.380862,0.548134,-0.83391,-1.213478,-0.217131,0.683318,1.034235
3,-0.301555,-0.872802,2.498527,-0.301544,-0.587486,-0.414987,-0.039545,0.787011,0.807038,0.794548,...,-0.982719,0.43858,-0.809415,-1.016802,-0.014837,-0.273769,-0.812462,-0.286376,-0.185636,-0.156724
4,-0.272393,0.460876,0.086985,-0.197278,-0.465605,-0.192678,0.518429,-1.042825,0.356489,-0.30174,...,-0.573168,-0.192062,-1.052588,0.768968,0.641618,-0.321887,-0.60563,-0.64512,-0.729316,0.165118


# UMAP
Following directions [here](https://umap-learn.readthedocs.io/en/latest/basic_usage.html).

In [23]:
import umap

`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.
Tensorflow not installed; ParametricUMAP will be unavailable


In [24]:
reducer = umap.UMAP(n_components=10, # low end of typical for feature reduction
                    n_neighbors=15, # default value
                    random_state=42,
                    transform_seed=42,
                   )

`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.


In [25]:
umapper = reducer.fit(X_pca)
dump(reducer, datapath/'umap_reducer-20211107-n_comp10-n_neighbors15-rs42.joblib')

['/media/sf/easystore/kaggle_data/tabular_playgrounds/nov2021/datasets/umap_reducer-20211107-n_comp10-n_neighbors15-rs42.joblib']

In [19]:
# umapper = load(datapath/'umap_reducer.joblib')

In [27]:
embedding = reducer.transform(X_pca)
embedding.shape
dump(embedding, datapath/'X-RobustScaled-pca_mle-umap_embedding_20211107-n_comp10-n_neighbors15-rs42.joblib')

`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.


['/media/sf/easystore/kaggle_data/tabular_playgrounds/nov2021/datasets/X-RobustScaled-pca_mle-umap_embedding_20211107-n_comp10-n_neighbors15-rs42.joblib']

In [21]:
# embedding = load(datapath/'X_orig-RobustScaled-umap_embedding.joblib')

In [28]:
embedding_df = pd.DataFrame(embedding)
embedding_df.head()

`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.


Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,2.05373,-1.756666,11.37886,6.936342,6.235762,8.824183,9.119911,3.103356,9.724831,3.719539
1,1.554933,-0.268288,7.588934,7.686738,6.930284,8.36887,7.913663,0.505868,8.038529,3.076725
2,0.963801,-0.15052,10.355945,7.051244,6.157423,10.085479,8.593321,2.286653,10.166838,3.171345
3,1.592505,0.211458,13.213996,5.814211,6.522682,9.577888,9.720708,0.139408,10.984689,0.582962
4,1.18672,0.064047,10.237234,7.400344,6.29491,9.111214,8.108749,1.990173,10.146399,2.383465


Now let's transform the test set too.

In [31]:
exmodel_config X_test.drop('id', axis=1)
X_test_scaled = scaler.transform(X_test)
X_test_pca = pca.transform(X_test_scaled)
X_test_embedding = reducer.transform(X_test_pca)
X_test_embedding_df = pd.DataFrame(X_test_embedding)
dump(X_test_embedding_df, datapath/'X_test-RobustScaled-pca_mle-umap_embedding_20211107-n_comp10-n_neighbors15-rs42.joblib')

`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.
X does not have valid feature names, but PCA was fitted with feature names


['/media/sf/easystore/kaggle_data/tabular_playgrounds/nov2021/datasets/X_test-RobustScaled-pca_mle-umap_embedding_20211107-n_comp10-n_neighbors15-rs42.joblib']

In [32]:
X_test_embedding_df.head()

`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.


Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,1.805481,0.382892,8.975375,6.223257,6.345812,8.811499,8.968513,2.358689,10.336686,1.723404
1,1.815467,1.81789,7.03835,7.159169,5.196725,10.220221,8.561192,0.262517,9.995115,1.153083
2,1.864622,0.302617,9.193873,6.061775,6.450215,8.866345,9.312219,2.476474,10.220789,1.538796
3,1.284698,-0.488024,10.264484,7.524199,6.294941,9.43299,8.738018,2.073075,9.901546,3.294871
4,1.239167,0.512479,10.570358,5.112319,6.427414,9.153562,8.228388,2.302179,8.561516,1.759488


In [33]:
str(pca)

`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.


"PCA(n_components='mle', random_state=42)"

In [34]:
str(reducer)

"UMAP(n_components=10, random_state=42, tqdm_kwds={'bar_format': '{desc}: {percentage:3.0f}%| {bar} {n_fmt}/{total_fmt} [{elapsed}]', 'desc': 'Epochs completed', 'disable': True})"

In [36]:
exmodel_config['pca'] = str(pca)
exmodel_config['umap'] = str(reducer)
exmodel_config['scaler'] = str(scaler)
exmodel_config['type'] = 'preprocessing experiment'
exmodel_config['level'] = 1

In [37]:
# exmodel_config['train_source'] = 'X-RobustScaled-pca_mle-umap_embedding_20211107-n_comp10-n_neighbors15-rs42.joblib'
# exmodel_config['test_source'] = 'X_test-RobustScaled-pca_mle-umap_embedding_20211107-n_comp10-n_neighbors15-rs42.joblib'

## Weights and Biases Run Config

Below is the configuration for a Weights and Biases (`wandb`) run.

In [38]:
# wandb config:
wandb_config = {
    'name': f"{os.environ['WANDB_NOTEBOOK_NAME'][:-6]}_{datetime.now().strftime('%H%M%S')}", # just removes the .ipynb extension, leaving the notebook filename's stem
    'tags': ['preprocessing'],
    'notes': "Running Big Three GBMs with default parameters and a dataset transformed with 1. RobustScaler, 2. PCA (MLE), 3. UMAP (n_neighbors=15, n_components=10)"
}

In [39]:
def cross_validate_model(arch:str, X, y, X_test, params:dict={}, start_fold=0, 
                         exmodel_config=exmodel_config, wandb_config=wandb_config, 
                         random_state=42, shuffle_kfolds=True, wandb_tracked=True, encode_cats=False):
#     if exmodel_config['kfolds'] == 1:
#         print("Proceeding with holdout")
#     X_train, X_valid, y_train, y_valid = train_test_split(X, y, 
#                                                           test_size=0.2, 
#                                                           random_state=SEED)                 
    
    # prepare for k-fold cross-validation; random-state here is notebook-wide, not per-model
    # shuffle on the initial sets, but not subsequently -- performing the same operation twice means a very different dataset
    if shuffle_kfolds:
        kfold = exmodel_config['cross_val_strategy'](n_splits=exmodel_config['kfolds'], shuffle=True, random_state=SEED)
    else:
        kfold = exmodel_config['cross_val_strategy'](n_splits=exmodel_config['kfolds'], shuffle=False)
    
    if wandb_tracked:
        exmodel_config['arch'] = arch
        exmodel_config[f'{arch}_params'] = str(params)
        wandb.init(
            project="202111_Kaggle_tabular_playground",
            save_code=True,
            tags=wandb_config['tags'],
            name=wandb_config['name'],
            notes=wandb_config['notes'],
            config=exmodel_config
    )   
    
    # setup for serialization
    # runpath = Path(modelpath/f"{wandb_config['name']}_{arch}_{exmodel_config['kfolds']}folds/")
    # (runpath).mkdir(exist_ok=True)
    
    
    # initialize lists for out-of-fold preds and ground truth
    oof_preds, oof_y = [], []
    
    # initialize a numpy.ndarray containing the fold-model's preds for test set
    test_preds = np.zeros((X_test.shape[0]))
    
    # if using deep learning with pytorch-widedeep, do data preprocessing now, before splits
    if 'widedeep' in arch:
        # NOTE THAT ENCODING NOT DEPLOYED FOR THIS YET
        # preprocessing first
        wide_cols = [f for f in X.columns if X[f].nunique() == 2] #list(X_train.columns) if X_train.iloc[:,f].nunique() == 2] # binary indicator vars are wide
        cont_cols = [f for f in X.columns if X[f].nunique() > 2] #list(X_train.columns) if X_train.iloc[:,f].nunique() > 2] # others are cont

        # wide part
        # wide_preprocessor = WidePreprocessor(wide_cols=wide_cols)
        # X_wide = wide_preprocessor.fit_transform(X)
#         print(f"X_wide.shape = {X_wide.shape}")
#         X_wide = np.array(X_train[wide_cols])
        

        # deep part
        tab_preprocessor = TabPreprocessor(continuous_cols=cont_cols)#, embed_cols=embed_cols, )
        X_tab = tab_preprocessor.fit_transform(X)   
#         print(f"X_tab.shape = {X_tab.shape}")
        
        # transforming the test set
        X_test_wide = wide_preprocessor.transform(X_test)
        X_test_tab = tab_preprocessor.transform(X_test)
        
        # at this point, X_wide, X_tab, X_test_wide, and X_test_tab will all be np.ndarrays
    
#     else: # if using a GBM, simply convert the pd.DataFrames to np.ndarrays
#         X = np.array(X) # CAN YOU USE CATEGORY_ENCODERS ON NP.NDARRAYS?
#         X_test = np.array(X_test)
    
    for fold, (train_ids, valid_ids) in enumerate(kfold.split(X,y)):
#         print(f"type(train_ids) = {type(train_ids)} and train_ids.shape = {train_ids.shape}")
#         print(f"type(valid_ids) = {type(valid_ids)} and train_ids.shape = {valid_ids.shape}")
        if fold < start_fold: # skip folds that are already trained
            continue
        else:
            print(f"FOLD {fold}")
            print("---------------------------------------------------")
            y_train, y_valid = y[train_ids], y[valid_ids] # y will be an np.ndarray already; handling will be same regardless of model
            if 'widedeep' in arch: # handle wide and deep tabs in parallel
                X_train_wide, X_valid_wide = X_wide[train_ids, :], X_wide[valid_ids, :]
                X_train_tab, X_valid_tab = X_tab[train_ids, :], X_tab[valid_ids, :]
#                 print(f"X_train_wide.shape = {X_train_wide.shape}")
#                 print(f"X_train_tab.shape = {X_train_tab.shape}")
#                 print(f"X_test_wide.shape = {X_test_wide.shape}")
#                 print(f"X_test_tab.shape = {X_test_tab.shape}")
            else: # handle datasets for GBMs
                if isinstance(X, np.ndarray):
                    X_train, X_valid = X[train_ids], X[valid_ids]
                else:
                    X_train, X_valid = X.iloc[train_ids,:], X.iloc[valid_ids,:] # bc need pandas.DataFrames for ce
                if encode_cats:
                    encoder = ce.WOEEncoder(cols=categoricals)
                    encoder.fit(X_train,y_train)
                    X_train = encoder.transform(X_train)
                    X_valid = encoder.transform(X_valid)
                # exmodel_config['feature_count'] = len(X.columns)
                    wandb.log({
                        'feature_count': X_train.shape[1],
                        'instance_count': X_train.shape[0],
                        'encoder': str(encoder)
                    })
#                 exmodel_config['instance_count'] = X_train.shape[0]
#                 exmodel_config['encoder'] = str(encoder)
#                     X_test = encoder.transform(X_test)
#                 y_train, y_valid = y[train_ids], y[valid_ids]
            
        # define models
        if arch == 'xgboost':
            model = XGBClassifier(
                booster='gbtree',
                tree_method='gpu_hist',
                random_state=random_state,
                n_jobs=-1, 
                verbosity=1, 
                objective='binary:logistic',
                **params)
            if wandb_tracked:
                model.fit(X_train, y_train, callbacks=[wandb.xgboost.wandb_callback()])
            else:
                model.fit(X_train, y_train)
            y_valid_preds = model.predict_proba(X_valid)[:,1]
            
            # add the fold-model's OOF preds and ground truths to the out-of-loop lists
            oof_preds.extend(y_valid_preds)
            oof_y.extend(y_valid)
            
            # add the fold's predictions to the model's test-set predictions (will divide later)
            test_preds += model.predict_proba(X_test)[:,1]


        elif arch == 'lightgbm':
            try:
                model = LGBMClassifier(
                    objective='binary',
                    random_state=random_state,
#                     device_type='cpu',
#                     n_jobs=-1,
    #                 eval_metric='auc',
                    device_type='gpu',
                    max_bin=63, # 15 might be even better for GPU perf, but depends on dataset -- see https://lightgbm.readthedocs.io/en/latest/GPU-Performance.html
                    gpu_use_dp=False, # forces use of single precision rather than double for better perf, esp on consumer Nvidia chips
                    **params)
                
                if wandb_tracked:
                    model.fit(X_train, y_train, callbacks=[wandb.lightgbm.wandb_callback()],)
                else:
                    model.fit(X_train, y_train)
            except LightGBMError:
                model = LGBMClassifier(
                    objective='binary',
                    random_state=random_state,
                    device_type='cpu',
                    n_jobs=-1,
    #                 eval_metric='auc',
    #                 device_type='gpu',
    #                 max_bin=63, # 15 might be even better for GPU perf, but depends on dataset -- see https://lightgbm.readthedocs.io/en/latest/GPU-Performance.html
    #                 gpu_use_dp=False, # forces use of single precision rather than double for better perf, esp on consumer Nvidia chips
                    **params)
                
                if wandb_tracked:
                    model.fit(X_train, y_train, callbacks=[wandb.lightgbm.wandb_callback()],)
                else:
                    model.fit(X_train, y_train)
            y_valid_preds = model.predict_proba(X_valid)[:,1]
            
            # add the fold-model's OOF preds and ground truths to the out-of-loop lists
            oof_preds.extend(y_valid_preds)
            oof_y.extend(y_valid)
            
            # add the fold's predictions to the model's test-set predictions (will divide later)
            test_preds += model.predict_proba(X_test)[:,1]

            
        elif arch == 'catboost':
            model = CatBoostClassifier(
                task_type='GPU',
                silent=True,
                random_state=random_state,
                **params) 
        
            model.fit(X_train, y_train)
            y_valid_preds = model.predict_proba(X_valid)[:,1]
            
            # add the fold-model's OOF preds and ground truths to the out-of-loop lists
            oof_preds.extend(y_valid_preds)
            oof_y.extend(y_valid)
            
            # add the fold's predictions to the model's test-set predictions (will divide later)
            test_preds += model.predict_proba(X_test)[:,1]
            
        elif 'widedeep' in arch: # only coding for TabMlp right now
#             X_train = pd.DataFrame(X_train, columns=[f"f{x}" for x in range(X_train.shape[1])])
#             X_valid = pd.DataFrame(X_valid, columns=[f"f{x}" for x in range(X_valid.shape[1])])
#             X_test = pd.DataFrame(X_test, columns=[f"f{x}" for x in range(X_test.shape[1])])
            
            wide = Wide(wide_dim=np.unique(X_wide).shape[0], pred_dim=1)
            deeptabular = TabMlp(
                mlp_hidden_dims=[64,32],
                column_idx=tab_preprocessor.column_idx,
            #     embed_input=tab_preprocessor.embeddings_input,
                continuous_cols=cont_cols,
            )
            
            # model instantiation and training
            model = WideDeep(wide=wide, deeptabular=deeptabular)
            
            
            n_epochs = 300

            # pytorch hyperparams
            wide_opt = AdamW(model.wide.parameters(), lr=0.1)
            deep_opt = AdamW(model.deeptabular.parameters(), lr=0.1)
            
            wide_sch = OneCycleLR(optimizer=wide_opt, max_lr=0.01, steps_per_epoch=X_train_wide.shape[0], epochs=n_epochs)
            deep_sch = OneCycleLR(optimizer=deep_opt, max_lr=0.01, steps_per_epoch=X_train_tab.shape[0], epochs=n_epochs)
            
            optimizers = {'wide': wide_opt, 'deeptabular': deep_opt }
            lr_schedulers = {'wide': wide_sch, 'deeptabular': deep_sch }
            
            
            callbacks = [
                LRHistory(n_epochs=n_epochs), 
            ]
            
            # trainer
            trainer = Trainer(model=model, 
                              objective='binary', 
                              metrics=[Accuracy], # with AUROC got TypeError: '>' not supported between instances of 'NoneType' and 'int' 
                              seed=random_state, 
                              optimizers=optimizers,
                              callbacks=callbacks
                             )
            
#             print(f"type(X_train_wide) is {type(X_train_wide)} and type(X_train_tab) is {type(X_train_tab)}")
            trainer.fit( # this is where problem is beginning
                X_wide=X_train_wide,
                X_tab=X_train_tab,
                target=y_train,
                n_epochs=n_epochs,
                batch_size=1024, # default value is 32
#                 val_split=0.2, # no need for this
            )
            
            y_valid_preds = trainer.predict_proba(X_wide=X_valid_wide, X_tab=X_valid_tab, batch_size=1024)[:,1]
            
            # add the fold-model's OOF preds and ground truths to the out-of-loop lists
            oof_preds.extend(y_valid_preds)
            oof_y.extend(y_valid)
            
            
            # test set inference
            fold_test_preds = trainer.predict_proba(X_wide=X_test_wide, X_tab=X_test_tab, batch_size=1024)[:,1]
            test_preds += fold_test_preds
            

        
        
       
        
        

#         valid_loss = log_loss(y_valid, y_pred)
        # give the valid AUC score, for edification
        fold_valid_auc = roc_auc_score(y_valid, y_valid_preds)
        if wandb_tracked:
            wandb.log({f'fold{fold}_valid_roc_auc': fold_valid_auc})
        print(f"Valid AUC for fold {fold} is {fold_valid_auc}")   
        # dump(model, Path(runpath/f"{arch}_fold{fold}_rs{random_state}_model.joblib"))

    model_valid_auc = roc_auc_score(oof_y, oof_preds)
    print(f"Valid AUC score for {arch} model is {model_valid_auc}")
    if wandb_tracked:
        wandb.log({'overall_valid_auc': model_valid_auc,
                   'model_params': str(model.parameters()) if 'widedeep' in arch else str(model.get_params()),
                   'model_seed': random_state,
                  })
        wandb.finish()
    
    # finalize test preds
    test_preds /= exmodel_config['kfolds']
    
    # save OOF preds and test-set preds
#     if 'widedeep' in arch:
#         dump(oof_preds, Path(predpath/f"{wandb_config['name']}_{arch}_{exmodel_config['kfolds']}folds_{n_epochs}epochs-per-fold_rs{random_state}_oof_preds.joblib"))
#         dump(test_preds, Path(predpath/f"{wandb_config['name']}_{arch}_{exmodel_config['kfolds']}folds_{n_epochs}epochs-per-fold_rs{random_state}_test_preds.joblib"))
    
#     else:
#         dump(oof_preds, Path(predpath/f"{wandb_config['name']}_{arch}_{exmodel_config['kfolds']}folds_rs{random_state}_oof_preds.joblib"))
#         dump(test_preds, Path(predpath/f"{wandb_config['name']}_{arch}_{exmodel_config['kfolds']}folds_rs{random_state}_test_preds.joblib"))
    
    if not (datapath/f"{exmodel_config['kfolds']}folds_rs{SEED}_oof_y.joblib").is_file():
        dump(oof_y, predpath/f"{exmodel_config['kfolds']}folds_rs{SEED}_oof_y.joblib")
    
#     if wandb_tracked:
# #         if 'widedeep' in arch:
#         wandb.log({'model_valid_auc': model_valid_auc,
# #                    'oof_preds': oof_preds,
# #                    'test_preds': test_preds,
#                    'model_params': str(model.parameters()) if 'widedeep' in arch else str(model.get_params()), 
#         #                    'model_params': str(model.get_params()),
#         })
# #         wandb.log({'model_valid_auc': model_valid_auc,
# #                    'oof_preds': oof_preds,
# #                    'test_preds': test_preds,
# # #                    'model_params': str(model.get_params()),
# #                   })
#         wandb.finish()
    return oof_preds, test_preds
        

In [40]:
architectures = ['xgboost', 'lightgbm', 'catboost']#, 'widedeep-tabmlp', 'widedeep-saint']

In [41]:
model_seeds = [42]#, 1983, 550, 1701, 2063]

In [42]:
oof_lv1, test_lv1 = pd.DataFrame(), pd.DataFrame() # initialize dataframes

In [43]:
type(embedding)

numpy.ndarray

In [44]:
type(X_test_embedding_df)

pandas.core.frame.DataFrame

In [45]:
X = pd.DataFrame(embedding)
X_test = X_test_embedding_df

In [46]:
for arch in architectures:
    for model_seed in model_seeds:
        # update exmodel_config here
        oof_pred, test_pred = cross_validate_model(arch=arch, X=X, y=y, X_test=X_test, 
                                         wandb_config=wandb_config,
                                         random_state=model_seed,
                                         # params=lv1_params[arch],
                                         exmodel_config=exmodel_config, 
                                         wandb_tracked=True
                                        )
        oof_lv1[f'{arch}{model_seed}'] = oof_pred
        test_lv1[f'{arch}{model_seed}'] = test_pred

[34m[1mwandb[0m: Currently logged in as: [33mhushifang[0m (use `wandb login --relogin` to force relogin)


FOLD 0
---------------------------------------------------


Old style callback is deprecated.  See: https://xgboost.readthedocs.io/en/latest/python/callbacks.html


XGBoostError: [20:04:32] ../src/gbm/gbtree.cc:588: Check failed: common::AllVisibleGPUs() >= 1 (0 vs. 1) : No visible GPU is found for XGBoost.
Stack trace:
  [bt] (0) /home/sf/anaconda3/envs/tabular-x/lib/python3.8/site-packages/xgboost/lib/libxgboost.so(+0x178349) [0x7f6c8a644349]
  [bt] (1) /home/sf/anaconda3/envs/tabular-x/lib/python3.8/site-packages/xgboost/lib/libxgboost.so(+0x179062) [0x7f6c8a645062]
  [bt] (2) /home/sf/anaconda3/envs/tabular-x/lib/python3.8/site-packages/xgboost/lib/libxgboost.so(+0x17985a) [0x7f6c8a64585a]
  [bt] (3) /home/sf/anaconda3/envs/tabular-x/lib/python3.8/site-packages/xgboost/lib/libxgboost.so(+0x1b3525) [0x7f6c8a67f525]
  [bt] (4) /home/sf/anaconda3/envs/tabular-x/lib/python3.8/site-packages/xgboost/lib/libxgboost.so(XGBoosterUpdateOneIter+0x68) [0x7f6c8a565478]
  [bt] (5) /home/sf/anaconda3/envs/tabular-x/lib/python3.8/lib-dynload/../../libffi.so.7(+0x69dd) [0x7f6cef00f9dd]
  [bt] (6) /home/sf/anaconda3/envs/tabular-x/lib/python3.8/lib-dynload/../../libffi.so.7(+0x6067) [0x7f6cef00f067]
  [bt] (7) /home/sf/anaconda3/envs/tabular-x/lib/python3.8/lib-dynload/_ctypes.cpython-38-x86_64-linux-gnu.so(_ctypes_callproc+0x319) [0x7f6cededcd39]
  [bt] (8) /home/sf/anaconda3/envs/tabular-x/lib/python3.8/lib-dynload/_ctypes.cpython-38-x86_64-linux-gnu.so(+0x137e5) [0x7f6cededd7e5]

