In [1]:
# notebook configuration
# COLAB: set True when running on Google Colab; the path-setup cell then mounts
# Google Drive and reads the data from there instead of the local disk.
COLAB = False # will trigger manual installation of packages
# NOTE(review): USE_GPU is not read anywhere in the visible part of this notebook -- confirm it is still needed.
USE_GPU = True 

In [2]:
# basic imports
from pathlib import Path
import os
import math
from datetime import datetime
import random
import gc; gc.enable()

In [3]:
%matplotlib inline
%config Completer.use_jedi = False
os.environ['WANDB_NOTEBOOK_NAME'] = f"deeptrainer_{datetime.now().strftime('%Y%m%d')}.ipynb"

In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm

# general ML tooling
from sklearn.model_selection import train_test_split, StratifiedKFold, KFold
from sklearn.metrics import log_loss, roc_auc_score
import wandb
from wandb.xgboost import wandb_callback
from wandb.lightgbm import wandb_callback
from sklearn.impute import SimpleImputer #, KNNImputer

import seaborn as sns

# from catboost import CatBoostClassifier
# from xgboost import XGBClassifier
# from lightgbm import LGBMClassifier
# from sklearn.ensemble import StackingClassifier, RandomForestClassifier
from sklearn.preprocessing import RobustScaler #StandardScaler #, MinMaxScaler, MaxAbsScaler, RobustScaler, PolynomialFeatures
# from sklearn.linear_model import LogisticRegression
# from sklearn.feature_selection import SelectKBest, f_regression
from joblib import dump, load
# feature engineering tools
# from sklearn.feature_selection import mutual_info_regression
# import featuretools as ft

# from BorutaShap import BorutaShap
from gauss_rank_scaler import GaussRankScaler

In [5]:
from SAINT import TabAttention # from the official SAINT implementation as of 20211118, https://github.com/somepago/saint/blob/main/models/model.py

In [6]:
import torchinfo
from pytorch_widedeep import Trainer
from pytorch_widedeep.preprocessing import WidePreprocessor, TabPreprocessor
from pytorch_widedeep.models import Wide, TabMlp, WideDeep, SAINT, TabTransformer, TabNet, TabFastFormer, TabResnet
from pytorch_widedeep.metrics import Accuracy
from torchmetrics import AUROC
import torch
from torch.optim import Adam, AdamW, Adagrad, SGD, RMSprop, LBFGS
from torch.optim.lr_scheduler import ReduceLROnPlateau, CosineAnnealingWarmRestarts, CyclicLR, OneCycleLR, StepLR, CosineAnnealingLR
from pytorch_widedeep.callbacks import EarlyStopping, LRHistory, ModelCheckpoint

In [7]:
# Resolve the project root for the current environment, then derive every
# working directory from it so that the same names exist on Colab and locally.
if COLAB:
    # mount Google Drive
    from google.colab import drive
    drive.mount('/content/drive')

    # on Colab the datasets sit directly in the project folder on Drive
    root = Path('/content/drive/MyDrive/kaggle/tabular_playgrounds/nov2021/')
    datapath = root
else:
    # if on local machine
    root = Path('/media/sf/easystore/kaggle_data/tabular_playgrounds/nov2021/')
    datapath = root/'datasets'

# output locations used later in the notebook (predictions, submissions, studies);
# previously these were only defined in the local branch, so Colab runs hit a NameError
predpath = root/'preds'
subpath = root/'submissions'
studypath = root/'studies'

# parents are deliberately NOT created: a missing root (e.g. drive not mounted)
# should fail loudly here rather than silently create an empty tree
for pth in [datapath, predpath, subpath, studypath]:
    pth.mkdir(exist_ok=True)
    

In [8]:
SEED = 42

# Function to seed everything
def seed_everything(seed):
    """Seed every random number generator this notebook relies on.

    Seeds Python's `random`, NumPy's legacy global RNG and PyTorch (CPU and,
    when available, all CUDA devices), and exports PYTHONHASHSEED.

    Parameters
    ----------
    seed : int
        Seed value applied to all generators.
    """
    random.seed(seed)
    np.random.seed(seed)
    # Only affects subprocesses started afterwards: the running interpreter
    # reads PYTHONHASHSEED once at startup.
    os.environ['PYTHONHASHSEED'] = str(seed)
    # The models trained in this notebook are torch models, so torch must be
    # seeded as well; the CUDA call is a no-op when no GPU is present.
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

seed_everything(seed=SEED)

In [9]:
def reduce_memory_usage(df, verbose=True, allow_float16=True):
    """Downcast numeric columns of `df` to the narrowest dtype that holds their range.

    The frame is modified IN PLACE and also returned. Only columns whose dtype
    name is one of int8/16/32/64 or float16/32/64 are touched; everything else
    (strings, categoricals, unsigned ints, ...) is left as is.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to shrink (mutated in place).
    verbose : bool, default True
        Print the final size and the percentage saved.
    allow_float16 : bool, default True
        When True (the historical behaviour) float columns whose range fits are
        stored as float16. Half precision keeps only ~3 significant decimal
        digits, so pass False to stop at float32 when precision matters.

    Returns
    -------
    pd.DataFrame
        The same object that was passed in.
    """
    numerics = ["int8", "int16", "int32", "int64", "float16", "float32", "float64"]
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32) if allow_float16 else (np.float32,)

    start_mem = df.memory_usage().sum() / 1024 ** 2
    for col in df.columns:
        col_type = df[col].dtypes
        if str(col_type) not in numerics:
            continue
        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type).startswith("int"):
            # Bounds are inclusive: a column spanning exactly [-128, 127] fits int8.
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            # NaN/inf extremes fail every comparison and fall through to float64.
            target = np.float64
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    target = candidate
                    break
            df[col] = df[col].astype(target)
    end_mem = df.memory_usage().sum() / 1024 ** 2
    if verbose:
        # guard the ratio so a zero-byte frame does not produce nan/inf in the report
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print(
            "Mem. usage decreased to {:.2f} Mb ({:.1f}% reduction)".format(
                end_mem, reduction
            )
        )
    return df

In [10]:
exmodel_config = {'arch': 'widedeep-saint',}

In [11]:
# dataset_params will initially include either trivial class instances or loaded, precomputed artifacts
# Every value is stored as a string so the dict can be merged into the experiment
# config (see exmodel_config.update further down) and logged as run metadata.
dataset_params = {
    'train_source': str(datapath/'X_orig.feather'),
    'target_source': str(datapath/'y_corrected.joblib'),
    'test_source': str(datapath/'X_test_orig-no_scaling.feather'),
    # NOTE(review): this records a default-constructed scaler, while the scaler that is
    # actually fitted below is GaussRankScaler(epsilon=0.005) -- confirm which should be logged.
    'scaler': str(GaussRankScaler()),
    # 'pca': str(load(datapath/'pca_mle-RobustScaled_orig_trainset.joblib')),
    # 'umap': str(load(datapath/'umap_reducer-20211107-n_comp10-n_neighbors15-rs42-pca_mle-RobustScaled_orig_trainset.joblib')),
}   

# referring back to the already-entered attributes, specify how the pipeline was sequenced
# dataset_params['preprocessing_pipeline'] = str([dataset_params['scaler'], dataset_params['pca'], dataset_params['umap']]) # ACTUALLY this is unwieldy
# dataset_params['preprocessing_pipeline'] = '[scaler, pca, umap]' # more fragile, but also more readable

# now, load the datasets and generate more metadata from them
# X / X_test are read as DataFrames from feather files; y is loaded from a joblib
# artifact (later code indexes it with integer arrays -- presumably a numpy array, TODO confirm).
X = pd.read_feather(dataset_params['train_source'])# load(dataset_params['train_source'])
y = load(dataset_params['target_source'])
X_test = pd.read_feather(dataset_params['test_source']) #load(dataset_params['test_source'])

# dataset_params['feature_count'] = X.shape[1]
# dataset_params['instance_count'] = X.shape[0]

In [12]:
# decrease memory footprint
X = reduce_memory_usage(X)
X_test = reduce_memory_usage(X_test)

In [13]:
from sklearn.decomposition import PCA
import umap

In [14]:
# Description of the preprocessing applied in the cells below. The values are
# strings (or None) because this dict is merged into the logged experiment config;
# the 'binning' and 'scaling, normalization' entries also act as on/off switches:
# the corresponding cells only run when the entry is truthy.
preprocessing_params = {
    # text description of the per-column quantile binning (not evaluated, only logged)
    'binning': "pd.qcut(X.iloc[:,i],X.shape[1],labels=False,duplicates = 'drop')",
    'scaling, normalization': str(GaussRankScaler(epsilon=0.005)),
    # 'reduction': str(PCA(n_components='mle', random_state=42)),
    'reduction': None,
    'manifold': None,
    # 'manifold': str(umap.UMAP(n_components=10, n_neighbors=15, random_state=42, transform_seed=42,)),
    'clustering': None,
}

In [15]:
# h/t Laurent Pourchot https://www.kaggle.com/pourchot/in-python-tabular-denoising-residual-network/
# Quantile-bin every feature into (up to) 100 integer codes -- i.e. percentiles --
# for the "bins" / wide head of the network.
if preprocessing_params['binning']:
    n_bins = 100
    n_features = X.shape[1]
    bin_cols = [f'rkd_f{col}' for col in range(n_features)]

    X_bins = np.zeros((X.shape[0], n_features))
    X_bins_test = np.zeros((X_test.shape[0], n_features))
    for i in range(n_features): # assumes X is a pd.DataFrame
        # learn the bin edges on the training column only ...
        train_codes, edges = pd.qcut(X.iloc[:, i], n_bins, labels=False, duplicates='drop', retbins=True)
        X_bins[:, i] = train_codes
        # ... and reuse them for the test column, so that a given raw value gets the
        # same bin id in both sets (binning the test set with its own quantiles does not
        # guarantee that). The outer edges are opened up so test values outside the
        # training range land in the first / last bin instead of becoming NaN.
        edges = edges.astype(np.float64)
        edges[0], edges[-1] = -np.inf, np.inf
        X_bins_test[:, i] = pd.cut(X_test.iloc[:, i], bins=edges, labels=False)

    # at most 100 codes (0..99), which fits in int8
    X_bins = pd.DataFrame(X_bins.astype(np.int8), index=X.index, columns=bin_cols)
    X_bins_test = pd.DataFrame(X_bins_test.astype(np.int8), index=X_test.index, columns=bin_cols)

In [16]:
# Gauss-rank scale the raw features: the scaler is fitted on the training frame
# and then applied unchanged to the test frame; both results are wrapped back
# into DataFrames that keep the original column names and row index.
if preprocessing_params['scaling, normalization']:
    scaler = GaussRankScaler(epsilon=0.005)
    X_gauss = pd.DataFrame(
        scaler.fit_transform(X),
        columns=X.columns,
        index=X.index,
    )
    X_gauss_test = pd.DataFrame(
        scaler.transform(X_test),
        columns=X_test.columns,
        index=X_test.index,
    )

In [17]:
# if preprocessing_params['reduction']:
#     from sklearn.decomposition import PCA
#     pca = PCA(n_components='mle', random_state=42)
#     X_pca = pca.fit_transform(X_gauss)
#     # X_pca = pca.fit_transform(X)
#     X_pca = pd.DataFrame(X_pca, index=X.index)
#     import umap
#     reducer = umap.UMAP(n_components=10, # low end of typical for feature reduction
#                     n_neighbors=15, # default value
#                     random_state=42,
#                     transform_seed=42,
#                    )
#     umapper = reducer.fit(X_pca)
#     embedding = reducer.transform(X_pca)
#     embedding_df = pd.DataFrame(embedding,columns=[f'embed_{col}' for col in range(10)])
#     X_gauss = X_gauss.join(embedding_df)
#     # X = X.join(embedding_df)

In [18]:
# Final feature frames: scaled columns first, then the rkd_f* bin columns.
# join() aligns on the row index, which both inputs inherit from X / X_test.
# Requires both the scaling and the binning cells above to have run.
X_pre = X_gauss.join(X_bins)
X_pre_test = X_gauss_test.join(X_bins_test)

In [19]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [20]:
X_pre.iloc[:, 100:].nunique()

rkd_f0     100
rkd_f1     100
rkd_f2     100
rkd_f3     100
rkd_f4     100
rkd_f5     100
rkd_f6     100
rkd_f7     100
rkd_f8     100
rkd_f9     100
rkd_f10    100
rkd_f11    100
rkd_f12    100
rkd_f13    100
rkd_f14    100
rkd_f15    100
rkd_f16    100
rkd_f17    100
rkd_f18    100
rkd_f19    100
rkd_f20    100
rkd_f21    100
rkd_f22    100
rkd_f23    100
rkd_f24    100
rkd_f25    100
rkd_f26    100
rkd_f27    100
rkd_f28    100
rkd_f29    100
rkd_f30    100
rkd_f31    100
rkd_f32    100
rkd_f33    100
rkd_f34    100
rkd_f35    100
rkd_f36    100
rkd_f37    100
rkd_f38    100
rkd_f39    100
rkd_f40    100
rkd_f41    100
rkd_f42    100
rkd_f43    100
rkd_f44    100
rkd_f45    100
rkd_f46    100
rkd_f47    100
rkd_f48    100
rkd_f49    100
rkd_f50    100
rkd_f51    100
rkd_f52    100
rkd_f53    100
rkd_f54    100
rkd_f55    100
rkd_f56    100
rkd_f57    100
rkd_f58    100
rkd_f59    100
rkd_f60    100
rkd_f61    100
rkd_f62    100
rkd_f63    100
rkd_f64    100
rkd_f65    100
rkd_f66   

In [21]:
X_pre_test.iloc[:, 100:].nunique()

rkd_f0     100
rkd_f1     100
rkd_f2     100
rkd_f3     100
rkd_f4     100
rkd_f5     100
rkd_f6     100
rkd_f7     100
rkd_f8     100
rkd_f9     100
rkd_f10    100
rkd_f11    100
rkd_f12    100
rkd_f13    100
rkd_f14    100
rkd_f15    100
rkd_f16    100
rkd_f17    100
rkd_f18    100
rkd_f19    100
rkd_f20    100
rkd_f21    100
rkd_f22    100
rkd_f23    100
rkd_f24    100
rkd_f25    100
rkd_f26    100
rkd_f27    100
rkd_f28    100
rkd_f29    100
rkd_f30    100
rkd_f31    100
rkd_f32    100
rkd_f33    100
rkd_f34    100
rkd_f35    100
rkd_f36    100
rkd_f37    100
rkd_f38    100
rkd_f39    100
rkd_f40    100
rkd_f41    100
rkd_f42    100
rkd_f43    100
rkd_f44    100
rkd_f45    100
rkd_f46    100
rkd_f47    100
rkd_f48    100
rkd_f49    100
rkd_f50    100
rkd_f51    100
rkd_f52    100
rkd_f53    100
rkd_f54    100
rkd_f55    100
rkd_f56    100
rkd_f57    100
rkd_f58    100
rkd_f59    100
rkd_f60    100
rkd_f61    100
rkd_f62    100
rkd_f63    100
rkd_f64    100
rkd_f65    100
rkd_f66   

In [22]:
# X_pre = pd.read_feather(datapath/'X_bins+GaussRankScaled+PCA,UMAP.feather')

In [23]:
# X_pre_cont = X_pre.iloc[:, :110].join(y)

In [24]:
# df_corr = X_pre_cont.corr()

In [25]:
dataset_params['feature_count'] = X_pre.shape[1]
dataset_params['instance_count'] = X_pre.shape[0]

In [26]:
# Build the two model inputs expected by pytorch-widedeep: X_wide (from the bin
# columns) and X_tab (bin columns as embedded categoricals + scaled columns as
# continuous). Preprocessors are fitted on the train frame and reused for test.
if 'widedeep' in exmodel_config['arch']:
    # The split at position 100 assumes X has exactly 100 raw features, so that the
    # first 100 columns of X_pre are the scaled ones and the rest are rkd_f* bins.
    cont_cols = X_pre.iloc[:,:100].columns # 110 if using PCA-UMAP embedding
    wide_cols = X_pre.iloc[:, 100:].columns # 110 if using PCA-UMAP embedding
    # # if not preprocessing
    # X_wide = X_pre[wide_cols]
    # X_tab = X_pre[cont_cols]
    
    # if preprocessing
    wide_preprocessor = WidePreprocessor(wide_cols=wide_cols)
    X_wide = wide_preprocessor.fit_transform(X_pre)
    X_wide_test = wide_preprocessor.transform(X_pre_test)
    # tab_preprocessor = TabPreprocessor(continuous_cols=cont_cols, scale=False, for_transformer=False,embed_cols=wide_cols) # for TabMLP
    # scale=False because the continuous columns were already Gauss-rank scaled above.
    # NOTE(review): for_transformer=True is kept even when a TabMlp model is trained
    # on this X_tab -- confirm that is intended.
    tab_preprocessor = TabPreprocessor(continuous_cols=cont_cols, scale=False, for_transformer=True,embed_cols=wide_cols) # for SAINT
    X_tab = tab_preprocessor.fit_transform(X_pre)
    X_tab_test = tab_preprocessor.transform(X_pre_test)
    

In [27]:
widedeep_preprocessing_params = {
        'wide': str(wide_preprocessor),
        'deeptabular': str(tab_preprocessor),
    }
    
preprocessing_params.update(widedeep_preprocessing_params)
# print(preprocessing_params)

In [28]:
dump(X_wide_test, datapath/'X_wide_test_FIXED.joblib')
dump(X_tab_test, datapath/'X_tab_test_FIXED.joblib')

['/media/sf/easystore/kaggle_data/tabular_playgrounds/nov2021/datasets/X_tab_test_FIXED.joblib']

In [29]:
# X_wide = load(datapath/'X_wide.joblib')
# X_tab = load(datapath/'X_tab.joblib')
# X_wide_test = load(datapath/'X_wide_test.joblib')
# X_tab_test = load(datapath/'X_tab_test.joblib')
# dump(cont_cols, datapath/'cont_cols.joblib')

In [30]:
# meta-config for preprocessing and cross-validation, but NOT for model parameters
# The dataset and preprocessing descriptions are merged in so that a single dict
# can be handed to the experiment tracker; on a key collision the later entry wins.
exmodel_config.update({
#     "feature_selector": SelectKBest,
#     "k_best": 80,
#     "feature_selection_scoring": f_regression,
#     'random_state': SEED,
#     'feature_generation': ['NaN_counts', 'SummaryStats', 'NaN_OneHots'],
#     'subsample': 1,
    # NOTE: this is an already-constructed KFold *instance*, not the class itself
    'cross_val_strategy': KFold(n_splits=5, shuffle=True, random_state=SEED), # None for holdout, or the relevant sklearn class
    'kfolds': 5, # if 1, that means just doing holdout
    # NOTE(review): 'test_size' is not read anywhere in the visible code -- presumably only relevant for holdout runs
    'test_size': 0.2,
    **dataset_params,
    **preprocessing_params
#     'features_created': False,
#     'feature_creator': None,
})

In [31]:
# wandb config:
wandb_config = {
    'name': f"{os.environ['WANDB_NOTEBOOK_NAME'][:-6]}_{datetime.now().strftime('%H%M%S')}", # just removes the .ipynb extension, leaving the notebook filename's stem
    'tags': ['widedeep', 'deeplearning'],
    'notes': "Attempt using SAINT, default model params."
}

In [32]:
# X_pre_np = np.array(X_pre)

In [33]:
# deeptabular = TabMlp(continuous_cols=cont_cols, column_idx=tab_preprocessor.column_idx)
# deeptabular = TabMlp(continuous_cols=list(range(110)), column_idx={str(x): x for x in range(len(cont_cols))})

In [34]:
type(X_wide)

numpy.ndarray

In [35]:
type(X_tab)

numpy.ndarray

In [36]:
def cross_validate_widedeep(arch, X_wide=X_wide, X_tab=X_tab, y=y, X_wide_test=X_wide_test, X_tab_test=X_tab_test, folds=list(range(5)), 
                            prev_epochs=0, n_epochs=20, exmodel_config=exmodel_config, wandb_config=wandb_config, 
                            random_state=42, shuffle_kfolds=True, wandb_tracked=True):
    """
    Train a pytorch-widedeep model with k-fold cross-validation and predict the test set.

    Modification of the `cross_validate_model` function used in my stacking notebooks,
    customized to the dataset and to deep learning approaches.

    Parameters
    ----------
    arch : str
        Architecture tag. If it contains 'saint' a Wide + SAINT model is built, otherwise
        Wide + TabMlp. Also used in the names of all saved artifacts and logged to wandb.
    X_wide, X_tab : array-like
        Train inputs for the wide and deeptabular components; indexed with the fold ids.
    y : array-like
        Train target; must support indexing with integer arrays.
    X_wide_test, X_tab_test : array-like
        Test inputs for the two components.
    folds : list of int
        Fold indices to train in this call; all other folds are skipped (useful to resume).
    prev_epochs : int
        Epochs already trained in an earlier run. If > 0 (SAINT only) the matching saved
        weights are loaded first; it is also added to `n_epochs` in output file names.
    n_epochs : int
        Epochs to train per fold in this call.
    exmodel_config : dict
        Experiment config; 'kfolds' and 'cross_val_strategy' are read, and the whole dict
        (with 'arch' overridden by `arch`) is sent to wandb. The dict itself is not modified.
    wandb_config : dict
        Provides 'tags', 'name' and 'notes' for the wandb run.
    random_state : int
        Seed passed to the Trainer and logged as 'model_seed'. The fold split itself uses
        the notebook-wide SEED so that folds are identical across models.
    shuffle_kfolds : bool
        Shuffle before splitting (with SEED) or split in the given row order.
    wandb_tracked : bool
        Whether to log the run to Weights & Biases.

    Returns
    -------
    (list, np.ndarray)
        `oof_preds`: out-of-fold positive-class probabilities, concatenated in FOLD order
        (not original row order). `test_preds`: the mean of the fold models' test
        probabilities when every fold was trained in this call, otherwise their running
        SUM (so partial runs can be combined and averaged by the caller).

    Side effects
    ------------
    Writes fold indices and model weights under `datapath` and per-fold predictions under
    `predpath`. Relies on the notebook globals `cont_cols`, `tab_preprocessor`, `datapath`,
    `predpath` and `SEED`.
    """
    batch_size = 1048 # default value is 32
    n_splits = exmodel_config['kfolds']

    # prepare for k-fold cross-validation; random-state here is notebook-wide, not per-model
    # shuffle on the initial sets, but not subsequently -- performing the same operation twice means a very different dataset
    if shuffle_kfolds:
        kfold = KFold(n_splits=n_splits, shuffle=True, random_state=SEED)
    else:
        # the config may hold either a splitter class or an already-built instance;
        # an instance is not callable, so go through its class in that case
        cv_strategy = exmodel_config['cross_val_strategy']
        cv_class = cv_strategy if isinstance(cv_strategy, type) else type(cv_strategy)
        kfold = cv_class(n_splits=n_splits, shuffle=False)

    if wandb_tracked:
        wandb.init(
            project="202111_Kaggle_tabular_playground",
            save_code=True,
            tags=wandb_config['tags'],
            name=wandb_config['name'],
            notes=wandb_config['notes'],
            # log the architecture actually trained, without mutating the shared config dict
            config={**exmodel_config, 'arch': arch},
        )

    # initialize lists for out-of-fold preds and ground truth
    oof_preds, oof_y = [], []

    # accumulator for the fold models' test-set predictions
    test_preds = np.zeros(X_wide_test.shape[0])

    # size of the wide component's input vocabulary; identical for every fold
    wide_dim = np.unique(X_wide).shape[0]
    model = None

    for fold, (train_ids, valid_ids) in enumerate(kfold.split(X_wide, y)):
        torch.cuda.empty_cache()
        if fold not in folds: # skip folds that are already trained
            continue

        print(f"FOLD {fold}")
        print("---------------------------------------------------")
        dump(train_ids, datapath/f'kfold42-fold{fold}-train_ids.joblib')
        dump(valid_ids, datapath/f'kfold42-fold{fold}-valid_ids.joblib')
        y_train, y_valid = y[train_ids], y[valid_ids] # y will be an np.ndarray already; handling will be same regardless of model
        print(f"y_train shape is {y_train.shape}, y_valid shape is {y_valid.shape}")
        X_train_wide, X_train_tab = X_wide[train_ids], X_tab[train_ids]
        X_valid_wide, X_valid_tab = X_wide[valid_ids], X_tab[valid_ids]

        # model, optimizers and schedulers depend on the architecture
        wide = Wide(wide_dim=wide_dim, pred_dim=1)
        if 'saint' in arch:
            deeptabular = SAINT(continuous_cols=cont_cols, column_idx=tab_preprocessor.column_idx,)
            model = WideDeep(wide=wide, deeptabular=deeptabular)
            if prev_epochs > 0:
                # resume from the weights saved by an earlier call for this fold
                model.load_state_dict(torch.load(datapath/f"{arch}-20211127-weights-{prev_epochs}epochs-fold{fold}/wd_model.pt"))

            wide_opt = AdamW(model.wide.parameters(),)
            deep_opt = SGD(model.deeptabular.parameters(),  lr=0.01, momentum=0.75)

            wide_sch = CosineAnnealingWarmRestarts(optimizer=wide_opt, T_0=5)
            deep_sch = ReduceLROnPlateau(optimizer=deep_opt, )
        else:
            deeptabular = TabMlp(continuous_cols=cont_cols, column_idx=tab_preprocessor.column_idx)
            model = WideDeep(wide=wide, deeptabular=deeptabular)

            wide_opt = AdamW(model.wide.parameters(), lr=0.1)
            deep_opt = AdamW(model.deeptabular.parameters(), lr=0.1)

            # OneCycleLR counts optimizer steps, i.e. batches (not rows) per epoch
            steps_per_epoch = math.ceil(len(train_ids) / batch_size)
            wide_sch = OneCycleLR(optimizer=wide_opt, max_lr=0.01, steps_per_epoch=steps_per_epoch, epochs=n_epochs)
            deep_sch = OneCycleLR(optimizer=deep_opt, max_lr=0.01, steps_per_epoch=steps_per_epoch, epochs=n_epochs)

        optimizers = {'wide': wide_opt, 'deeptabular': deep_opt }
        # NOTE(review): the schedulers are built but have never been handed to the Trainer
        # (no `lr_schedulers=` argument), so they are never stepped. Building them is not
        # entirely inert though -- e.g. OneCycleLR resets the optimizer's starting lr when
        # constructed. Wiring them in would change training (and ReduceLROnPlateau may need
        # library support), so this is left as-is; confirm the intent before passing
        # `lr_schedulers=lr_schedulers` below.
        lr_schedulers = {'wide': wide_sch, 'deeptabular': deep_sch }

        trainer = Trainer(model=model,
                          objective='binary',
                          metrics=[Accuracy], # with AUROC got TypeError: '>' not supported between instances of 'NoneType' and 'int'
                          seed=random_state,
                          optimizers=optimizers,
                          callbacks=[LRHistory(n_epochs=n_epochs)],
                         )

        trainer.fit(
            X_wide=X_train_wide,
            X_tab=X_train_tab,
            target=np.array(y_train),
            n_epochs=n_epochs,
            batch_size=batch_size,
        )

        trainer.save(path=datapath/f'{arch}-20211127-weights-{prev_epochs + n_epochs}epochs-fold{fold}', save_state_dict=True)

        # out-of-fold predictions: probability of the positive class
        y_valid_preds = trainer.predict_proba(X_wide=np.array(X_valid_wide), X_tab=np.array(X_valid_tab), batch_size=batch_size)[:,1]
        dump(y_valid_preds, predpath/f'{arch}-20211127-{prev_epochs + n_epochs}epochs-fold{fold}-oofpreds.joblib')

        # add the fold-model's OOF preds and ground truths to the out-of-loop lists
        oof_preds.extend(y_valid_preds)
        oof_y.extend(y_valid)

        # test set inference
        fold_test_preds = trainer.predict_proba(X_wide=np.array(X_wide_test), X_tab=np.array(X_tab_test), batch_size=batch_size)[:,1]
        dump(fold_test_preds, predpath/f'{arch}-20211127-{prev_epochs + n_epochs}epochs-fold{fold}-testpreds.joblib')
        test_preds += fold_test_preds

        # give the valid AUC score, for edification
        fold_valid_auc = roc_auc_score(y_valid, y_valid_preds)
        if wandb_tracked:
            wandb.log({f'fold{fold}_valid_roc_auc': fold_valid_auc})
        print(f"Valid AUC for fold {fold} is {fold_valid_auc}")

    # the overall score and the averaged test preds only make sense for a complete CV run
    ran_all_folds = all(k in folds for k in range(n_splits))

    summary = {
        # log the architecture repr (str(model.parameters()) is just a generator repr)
        'model_params': str(model),
        'model_seed': random_state,
    }
    if ran_all_folds:
        model_valid_auc = roc_auc_score(oof_y, oof_preds)
        print(f"Valid AUC score for {arch} model is {model_valid_auc}")
        summary['overall_valid_auc'] = model_valid_auc
        # finalize test preds
        test_preds /= n_splits

    if wandb_tracked:
        wandb.log(summary)
        wandb.finish()

    # save the OOF ground truth once; check and write the SAME location, and only
    # for a complete run so the file never holds a partial set of folds
    oof_y_path = predpath/f"{exmodel_config['kfolds']}folds_rs{SEED}_oof_y.joblib"
    if ran_all_folds and not oof_y_path.is_file():
        dump(oof_y, oof_y_path)

    return oof_preds, test_preds
        

In [37]:
# dump(y_valid, datapath/'y_valid-fold0.joblib')

In [38]:
# dump(X_wide, datapath/'X_wide.joblib')
# dump(X_tab, datapath/'X_tab.joblib')
# dump(X_wide_test, datapath/'X_wide_test.joblib')
# dump(X_tab_test, datapath/'X_tab_test.joblib')

In [39]:
del X_gauss, X_bins, X_pre

In [40]:
oof_preds, test_preds = cross_validate_widedeep('widedeep-tabmlp', prev_epochs=0, n_epochs=50)
# oof_preds, test_preds = cross_validate_widedeep('widedeep-saint', )
# dump(oof_preds, predpath/f'widedeep_saint-20211127-{n_epochs}epochs-mean-oofpreds.joblib')
# dump(test_preds, predpath/f'widedeep_saint-20211127-{n_epochs}epochs-mean-testpreds.joblib')

In [41]:
def cross_validate_widedeep(arch, X_wide=X_wide, X_tab=X_tab, y=y, X_wide_test=X_wide_test, X_tab_test=X_tab_test, folds=list(range(5)), 
                            prev_epochs=0, n_epochs=20, exmodel_config=exmodel_config, wandb_config=wandb_config, 
                            random_state=42, shuffle_kfolds=True, wandb_tracked=True):
    """
    Modification of the `cross_validate_model` function used in my stacking notebooks, customized to the dataset and to deep learning approaches.

    Trains one `WideDeep` model per requested fold, collects out-of-fold (OOF) predictions and accumulates
    test-set predictions across folds.

    Parameters
    ----------
    arch : str
        Architecture label. If it contains 'saint' the deep component is `SAINT`, otherwise `TabMlp`.
        Also used in the names of saved weights / prediction files and logged to wandb.
    X_wide, X_tab : array-like
        Training inputs for the wide and deep components; indexed with integer index arrays (`X[ids]`).
    y : array-like
        Training targets, indexed the same way as `X_wide` / `X_tab`.
    X_wide_test, X_tab_test : array-like
        Test inputs for the wide and deep components.
    folds : list of int
        Fold indices to train in this call; folds not listed are skipped (e.g. because they were trained earlier).
    prev_epochs : int
        Number of epochs already trained. If > 0, the fold's saved weights for that epoch count are loaded
        before training continues.
    n_epochs : int
        Number of epochs to train each fold in this call.
    exmodel_config : dict
        Experiment configuration; reads 'cross_val_strategy' and 'kfolds' and is passed to wandb as the run config.
        Its 'arch' key is overwritten in place with `arch` when `wandb_tracked` is True.
    wandb_config : dict
        Supplies 'tags', 'name' and 'notes' for the wandb run.
    random_state : int
        Seed handed to the pytorch-widedeep `Trainer` (the fold split itself uses the notebook-wide `SEED`).
    shuffle_kfolds : bool
        If True, use a shuffled 5-split `KFold` seeded with `SEED`; otherwise use the unshuffled strategy from `exmodel_config`.
    wandb_tracked : bool
        Whether to open a wandb run and log per-fold / overall metrics.

    Returns
    -------
    oof_preds : list
        OOF predictions (column 1 of `predict_proba`) for the folds trained, in fold order.
    test_preds : numpy.ndarray
        Sum of the per-fold test predictions; divided by the number of splits (i.e. averaged) only when
        `len(folds)` equals the number of splits.

    Notes
    -----
    Relies on notebook globals: `SEED`, `datapath`, `predpath`, `cont_cols` and an already-fitted `tab_preprocessor`.
    Side effects: writes fold indices and model weights under `datapath`, per-fold predictions (and, on a full run,
    the OOF ground truth) under `predpath`.
    """
    
    # prepare for k-fold cross-validation; random-state here is notebook-wide, not per-model
    # shuffle on the initial sets, but not subsequently -- performing the same operation twice means a very different dataset
    if shuffle_kfolds:
        # NOTE(review): n_splits is fixed at 5 here rather than read from exmodel_config['kfolds'] -- confirm this is intended
        kfold = KFold(n_splits=5, shuffle=True, random_state=SEED)
    else:
        kfold = exmodel_config['cross_val_strategy'](n_splits=exmodel_config['kfolds'], shuffle=False)
    # single source of truth for the number of folds, so the "all folds trained" check and the averaging below always agree
    n_splits = kfold.get_n_splits()
    
    if wandb_tracked:
        exmodel_config['arch'] = arch # record the architecture actually being trained
        wandb.init(
            project="202111_Kaggle_tabular_playground",
            save_code=True,
            tags=wandb_config['tags'],
            name=wandb_config['name'],
            notes=wandb_config['notes'],
            config=exmodel_config
        )
    
    # initialize lists for out-of-fold preds and ground truth
    oof_preds, oof_y = [], []
    
    # initialize a numpy.ndarray accumulating the fold-models' preds for the test set
    test_preds = np.zeros(X_wide_test.shape[0])
    
    # loop-invariant: size of the wide component's input vocabulary
    wide_dim = np.unique(X_wide).shape[0]
    is_saint = 'saint' in arch
    model = None # stays None if no fold in `folds` is ever trained
    
    # only the number of rows matters for generating the splits, so the wide inputs are used directly
    for fold, (train_ids, valid_ids) in enumerate(kfold.split(X_wide, y)):
        torch.cuda.empty_cache()
        if fold not in folds: # skip folds that are already trained
            continue
        
        print(f"FOLD {fold}")
        print("---------------------------------------------------")
        dump(train_ids, datapath/f'kfold42-fold{fold}-train_ids.joblib')
        dump(valid_ids, datapath/f'kfold42-fold{fold}-valid_ids.joblib')
        y_train, y_valid = y[train_ids], y[valid_ids] # y will be an np.ndarray already; handling will be same regardless of model
        print(f"y_train shape is {y_train.shape}, y_valid shape is {y_valid.shape}")
        X_train_wide, X_train_tab = X_wide[train_ids], X_tab[train_ids]
        X_valid_wide, X_valid_tab = X_wide[valid_ids], X_tab[valid_ids]
        
        # define model; scaling and tab preprocessing happen before this function is called
        wide = Wide(wide_dim=wide_dim, pred_dim=1)
        if is_saint:
            deeptabular = SAINT(continuous_cols=cont_cols, column_idx=tab_preprocessor.column_idx,)
        else:
            deeptabular = TabMlp(continuous_cols=cont_cols, column_idx=tab_preprocessor.column_idx)
        model = WideDeep(wide=wide, deeptabular=deeptabular)
        
        # resume from the weights saved after `prev_epochs` epochs, for either architecture, so that the
        # checkpoint name written below (prev_epochs + n_epochs) matches what was actually trained
        if prev_epochs > 0:
            model.load_state_dict(torch.load(datapath/f"{arch}-20211127-weights-{prev_epochs}epochs-fold{fold}/wd_model.pt"))
        
        # pytorch hyperparams
        if is_saint:
            wide_opt = AdamW(model.wide.parameters(),)
            deep_opt = SGD(model.deeptabular.parameters(),  lr=0.01, momentum=0.75)
            
            wide_sch = CosineAnnealingWarmRestarts(optimizer=wide_opt, T_0=5) 
            deep_sch = ReduceLROnPlateau(optimizer=deep_opt, )
        else:
            wide_opt = AdamW(model.wide.parameters(), lr=0.1)
            deep_opt = AdamW(model.deeptabular.parameters(), lr=0.1)
            
            # NOTE(review): steps_per_epoch is set to the number of training rows, not the number of batches -- confirm
            wide_sch = OneCycleLR(optimizer=wide_opt, max_lr=0.01, steps_per_epoch=X_train_wide.shape[0], epochs=n_epochs)
            deep_sch = OneCycleLR(optimizer=deep_opt, max_lr=0.01, steps_per_epoch=X_train_tab.shape[0], epochs=n_epochs)
        
        optimizers = {'wide': wide_opt, 'deeptabular': deep_opt }
        # NOTE(review): the schedulers are built but never passed to the Trainer, so they are never stepped.
        # They are kept because constructing a scheduler may already adjust its optimizer's learning rate;
        # confirm the intended schedule before either removing them or wiring them in.
        lr_schedulers = {'wide': wide_sch, 'deeptabular': deep_sch }
        
        callbacks = [
            LRHistory(n_epochs=n_epochs), 
        ]
        
        # trainer
        trainer = Trainer(model=model, 
                          objective='binary', 
                          metrics=[Accuracy], # with AUROC got TypeError: '>' not supported between instances of 'NoneType' and 'int' 
                          seed=random_state, 
                          optimizers=optimizers,
                          callbacks=callbacks
                         )
        
        trainer.fit( 
            X_wide=X_train_wide,
            X_tab=X_train_tab,
            target=np.array(y_train),
            n_epochs=n_epochs,
            batch_size=1048, # default value is 32
        )
        
        trainer.save(path=datapath/f'{arch}-20211127-weights-{prev_epochs + n_epochs}epochs-fold{fold}', save_state_dict=True)
        
        y_valid_preds = trainer.predict_proba(X_wide=np.array(X_valid_wide), X_tab=np.array(X_valid_tab), batch_size=1048)[:,1]
        dump(y_valid_preds, predpath/f'{arch}-20211127-{prev_epochs + n_epochs}epochs-fold{fold}-oofpreds.joblib')
        
        # add the fold-model's OOF preds and ground truths to the out-of-loop lists
        oof_preds.extend(y_valid_preds)
        oof_y.extend(y_valid)
        
        # test set inference
        fold_test_preds = trainer.predict_proba(X_wide=np.array(X_wide_test), X_tab=np.array(X_tab_test), batch_size=1048)[:,1]
        dump(fold_test_preds, predpath/f'{arch}-20211127-{prev_epochs + n_epochs}epochs-fold{fold}-testpreds.joblib')
        test_preds += fold_test_preds
        
        # give the valid AUC score, for edification
        fold_valid_auc = roc_auc_score(y_valid, y_valid_preds)
        if wandb_tracked:
            wandb.log({f'fold{fold}_valid_roc_auc': fold_valid_auc})
        print(f"Valid AUC for fold {fold} is {fold_valid_auc}")   
    
    # the overall score and the averaged test preds are only meaningful when every fold was trained in this call
    ran_all_folds = len(folds) == n_splits
    if ran_all_folds:
        model_valid_auc = roc_auc_score(oof_y, oof_preds)
        print(f"Valid AUC score for {arch} model is {model_valid_auc}")
        # finalize test preds
        test_preds /= n_splits
    
    if wandb_tracked:
        run_summary = {'model_seed': random_state}
        if ran_all_folds:
            run_summary['overall_valid_auc'] = model_valid_auc
        if model is not None:
            # NOTE(review): this stringifies the parameters iterator itself, not the weights -- confirm that is what should be logged
            run_summary['model_params'] = str(model.parameters())
        wandb.log(run_summary)
        wandb.finish()
    
    # save the OOF ground truth once, and only from a full run: a partial run would store a truncated
    # vector under the full-run filename. The existence check targets the same path that is written.
    oof_y_path = predpath/f"{exmodel_config['kfolds']}folds_rs{SEED}_oof_y.joblib"
    if ran_all_folds and not oof_y_path.is_file():
        dump(oof_y, oof_y_path)
    
    return oof_preds, test_preds
        

In [42]:
# cross-validate the 'widedeep-tabmlp' architecture: 50 epochs per fold, no previously trained epochs
oof_preds, test_preds = cross_validate_widedeep(
    'widedeep-tabmlp',
    prev_epochs=0,
    n_epochs=50,
)
# oof_preds, test_preds = cross_validate_widedeep('widedeep-saint', )
# dump(oof_preds, predpath/f'widedeep_saint-20211127-{n_epochs}epochs-mean-oofpreds.joblib')
# dump(test_preds, predpath/f'widedeep_saint-20211127-{n_epochs}epochs-mean-testpreds.joblib')