# widedeep_20211120
I want to start playing around with some NN architectures. Eventually, I want to try some straight PyTorch, but for starters, I'll use `widedeep`. As a scaler, I'll use RankGauss; I won't (yet) do any feature reduction or selection. I also won't (yet) use `cleanlab`, though I may try it in the future, either via the wrapper they suggest or via `skorch`

In [1]:
# notebook configuration
COLAB = False # will trigger manual installation of packages
USE_GPU = True 

In [2]:
# basic imports
from pathlib import Path
import os
import math
from datetime import datetime
import random
import gc; gc.enable()

In [3]:
%matplotlib inline
%config Completer.use_jedi = False
os.environ['WANDB_NOTEBOOK_NAME'] = f"deeptrainer_{datetime.now().strftime('%Y%m%d')}.ipynb"

Now, non-stdlib imports

In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm

# general ML tooling
from sklearn.model_selection import train_test_split, StratifiedKFold, KFold
from sklearn.metrics import log_loss, roc_auc_score
import wandb
from wandb.xgboost import wandb_callback
from wandb.lightgbm import wandb_callback
from sklearn.impute import SimpleImputer #, KNNImputer

import seaborn as sns

# from catboost import CatBoostClassifier
# from xgboost import XGBClassifier
# from lightgbm import LGBMClassifier
# from sklearn.ensemble import StackingClassifier, RandomForestClassifier
from sklearn.preprocessing import RobustScaler #StandardScaler #, MinMaxScaler, MaxAbsScaler, RobustScaler, PolynomialFeatures
# from sklearn.linear_model import LogisticRegression
# from sklearn.feature_selection import SelectKBest, f_regression
from joblib import dump, load
# feature engineering tools
# from sklearn.feature_selection import mutual_info_regression
# import featuretools as ft

# from BorutaShap import BorutaShap
from gauss_rank_scaler import GaussRankScaler


In [5]:
from SAINT import TabAttention # from the official SAINT implementation as of 20211118, https://github.com/somepago/saint/blob/main/models/model.py

In [6]:
import torchinfo
from pytorch_widedeep import Trainer
from pytorch_widedeep.preprocessing import WidePreprocessor, TabPreprocessor
from pytorch_widedeep.models import Wide, TabMlp, WideDeep, SAINT, TabTransformer, TabNet, TabFastFormer, TabResnet
from pytorch_widedeep.metrics import Accuracy
from torchmetrics import AUROC
import torch
from torch.optim import Adam, AdamW, Adagrad, SGD, RMSprop, LBFGS
from torch.optim.lr_scheduler import ReduceLROnPlateau, CosineAnnealingWarmRestarts, CyclicLR, OneCycleLR, StepLR, CosineAnnealingLR
from pytorch_widedeep.callbacks import EarlyStopping, LRHistory, ModelCheckpoint

In [7]:
# import category_encoders as ce

  and should_run_async(code)


Now, datapath setup

In [8]:
if COLAB:
    # mount Google Drive
    from google.colab import drive
    drive.mount('/content/drive')
    
    # handling datapath
    datapath = Path('/content/drive/MyDrive/kaggle/tabular_playgrounds/nov2021/')
    
else:
    # if on local machine
#     datapath = Path('/media/sf/easystore/kaggle_data/tabular_playgrounds/sep2021/')  
    root = Path('/media/sf/easystore/kaggle_data/tabular_playgrounds/nov2021/')
    datapath = root/'datasets'
    # edapath = root/'EDA'
    # modelpath = Path('/media/sf/easystore/kaggle_data/tabular_playgrounds/oct2021/models/')
    predpath = root/'preds'
    subpath = root/'submissions'
    
    for pth in [datapath, predpath, subpath]:
        pth.mkdir(exist_ok=True)
    


In [9]:
SEED = 42

# Function to seed everything
def seed_everything(seed):
    random.seed(seed)
    np.random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)

seed_everything(seed=SEED)

In [10]:
def reduce_memory_usage(df, verbose=True):
    numerics = ["int8", "int16", "int32", "int64", "float16", "float32", "float64"]
    start_mem = df.memory_usage().sum() / 1024 ** 2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == "int":
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if (
                    c_min > np.finfo(np.float16).min
                    and c_max < np.finfo(np.float16).max
                ):
                    df[col] = df[col].astype(np.float16)
                elif (
                    c_min > np.finfo(np.float32).min
                    and c_max < np.finfo(np.float32).max
                ):
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024 ** 2
    if verbose:
        print(
            "Mem. usage decreased to {:.2f} Mb ({:.1f}% reduction)".format(
                end_mem, 100 * (start_mem - end_mem) / start_mem
            )
        )
    return df

In [11]:
exmodel_config = {'arch': 'widedeep-tabmlp',}

In [12]:
# dataset_params will initially include either trivial class instances or loaded, precomputed artifacts
dataset_params = {
    'train_source': str(datapath/'X_orig.feather'),
    'target_source': str(datapath/'y_corrected.joblib'),
    'test_source': str(datapath/'X_test_orig-no_scaling.feather'),
    'scaler': str(GaussRankScaler()),
    # 'pca': str(load(datapath/'pca_mle-RobustScaled_orig_trainset.joblib')),
    # 'umap': str(load(datapath/'umap_reducer-20211107-n_comp10-n_neighbors15-rs42-pca_mle-RobustScaled_orig_trainset.joblib')),
}   

# referring back to the already-entered attributes, specify how the pipeline was sequenced
# dataset_params['preprocessing_pipeline'] = str([dataset_params['scaler'], dataset_params['pca'], dataset_params['umap']]) # ACTUALLY this is unwieldy
# dataset_params['preprocessing_pipeline'] = '[scaler, pca, umap]' # more fragile, but also more readable

# now, load the datasets and generate more metadata from them
X = pd.read_feather(dataset_params['train_source'])# load(dataset_params['train_source'])
y = load(dataset_params['target_source'])
X_test = pd.read_feather(dataset_params['test_source']) #load(dataset_params['test_source'])

# dataset_params['feature_count'] = X.shape[1]
# dataset_params['instance_count'] = X.shape[0]

In [13]:
# decrease memory footprint
X = reduce_memory_usage(X)
X_test = reduce_memory_usage(X_test)

Mem. usage decreased to 114.44 Mb (75.0% reduction)
Mem. usage decreased to 103.00 Mb (75.0% reduction)


# Preprocessing Data
Inspired a bit by Laurent Pourchot's Aug2021 Tabular Playground entry, I'm going to try to generate two versions of the dataset: a categorical one, using bins, and then (for now) a GaussRankScaled one. In the future, I might add further variations, e.g. with feature reduction via PCA and perhaps also UMAP and also denoising; I might also try other normalizations, e.g. Quantile.

In [14]:
from sklearn.decomposition import PCA
import umap

  warn(


In [15]:
preprocessing_params = {
    'binning': "pd.qcut(X.iloc[:,i],X.shape[1],labels=False,duplicates = 'drop')",
    'scaling, normalization': str(GaussRankScaler(epsilon=0.005)),
    'reduction': str(PCA(n_components='mle', random_state=42)),
    'manifold': str(umap.UMAP(n_components=10, n_neighbors=15, random_state=42, transform_seed=42,)),
    'clustering': None,
}

  and should_run_async(code)


## Binning (Generating wide cols)

In [16]:
# h/t Laurent Pourchot https://www.kaggle.com/pourchot/in-python-tabular-denoising-residual-network/
if preprocessing_params['binning']:
    # 100 bins for the bins head of the NN (i.e. percentiles):
    X_bins = np.zeros((X.shape[0],X.shape[1])) # he used all available data for the first tuple entry, but I'll start like this
    # X_bins_test = np.zeros((X_test.shape[0], X_test.shape[1]))
    for i in range(X.shape[1]): # assumes X is a pd.DataFrame
        X_bins[:,i] = pd.qcut(X.iloc[:,i],X.shape[1],labels=False,duplicates = 'drop')
        # X_bins_test[:,1] = pd.qcut(X_test.iloc[:,i],X_test.shape[1],labels=False,duplicates = 'drop')
    X_bins = X_bins.astype(np.int8)
    # X_bins_test = X_bins_test.astype(np.int8)
    X_bins = pd.DataFrame(X_bins, index=X.index, columns=[f'rkd_f{col}' for col in range(100)])
    # X_bins_test = pd.DataFrame(X_bins_test, index=X_test.index, columns=[f'rkd_f{col}' for col in range(100)])

## Reduction

In [17]:
if preprocessing_params['reduction']:
    from sklearn.decomposition import PCA
    pca = PCA(n_components='mle', random_state=42)
    # X_pca = pca.fit_transform(X_gauss)
    X_pca = pca.fit_transform(X)
    X_pca = pd.DataFrame(X_pca, index=X.index)
    import umap
    reducer = umap.UMAP(n_components=10, # low end of typical for feature reduction
                    n_neighbors=15, # default value
                    random_state=42,
                    transform_seed=42,
                   )
    umapper = reducer.fit(X_pca)
    embedding = reducer.transform(X_pca)
    embedding_df = pd.DataFrame(embedding,columns=[f'embed_{col}' for col in range(10)])
    # X_gauss = X_gauss.join(embedding_df)
    X = X.join(embedding_df)

## Normalizing (Preprocessing Deep Cols)

In [18]:
if preprocessing_params['scaling, normalization']:
    scaler = GaussRankScaler(epsilon=0.005)
    X_gauss = scaler.fit_transform(X)
    # X_gauss_test = scaler.transform(X_test)
    X_gauss = pd.DataFrame(X_gauss, columns=X.columns, index=X.index)
    # X_gauss_test = pd.DataFrame(X_gauss_test, columns=X_test.columns, index=X_test.index)

  and should_run_async(code)


## Clustering

## Preparing Data for WideDeep

In [19]:
X_pre = X_gauss.join(X_bins)
# X_pre_test = X_gauss_test.join(X_bins_test)

  and should_run_async(code)


## PREPARED LOADING

In [20]:
# X_pre = pd.read_feather(datapath/'X_bins+GaussRankScaled+PCA,UMAP.feather')

In [21]:
dataset_params['feature_count'] = X_pre.shape[1]
dataset_params['instance_count'] = X_pre.shape[0]

In [22]:
if 'widedeep' in exmodel_config['arch']:
    cont_cols = X_pre.iloc[:,:110].columns
    wide_cols = X_pre.iloc[:, 110:].columns
    wide_preprocessor = WidePreprocessor(wide_cols=wide_cols)
    X_wide = wide_preprocessor.fit_transform(X_pre)
    # X_wide_test = wide_preprocessor.transform(X_pre_test)
    # tab_preprocessor = TabPreprocessor(continuous_cols=cont_cols, scale=False, for_transformer=False,embed_cols=wide_cols)
    tab_preprocessor = TabPreprocessor(continuous_cols=cont_cols, scale=False, for_transformer=False,embed_cols=wide_cols)
    X_tab = tab_preprocessor.fit_transform(X_pre)
    # X_tab_test = tab_preprocessor.transform(X_pre_test)
    



In [23]:
widedeep_preprocessing_params = {
        'wide': str(wide_preprocessor),
        'deeptabular': str(tab_preprocessor),
    }
    
preprocessing_params.update(widedeep_preprocessing_params)
print(preprocessing_params)

{'binning': "pd.qcut(X.iloc[:,i],X.shape[1],labels=False,duplicates = 'drop')", 'scaling, normalization': 'GaussRankScaler(epsilon=0.005)', 'reduction': "PCA(n_components='mle', random_state=42)", 'manifold': 'UMAP(n_components=10, random_state=42)', 'clustering': None, 'wide': '<pytorch_widedeep.preprocessing.wide_preprocessor.WidePreprocessor object at 0x7ff038294a00>', 'deeptabular': '<pytorch_widedeep.preprocessing.tab_preprocessor.TabPreprocessor object at 0x7ff0a014ef70>'}


# Config Spec

In [24]:
# meta-config for preprocessing and cross-validation, but NOT for model parameters
exmodel_config.update({
#     "feature_selector": SelectKBest,
#     "k_best": 80,
#     "feature_selection_scoring": f_regression,
#     'random_state': SEED,
#     'feature_generation': ['NaN_counts', 'SummaryStats', 'NaN_OneHots'],
#     'subsample': 1,
    'cross_val_strategy': None, # None for holdout, or the relevant sklearn class
    'kfolds': 1, # if 1, that means just doing holdout
    'test_size': 0.2,
    **dataset_params,
    **preprocessing_params
#     'features_created': False,
#     'feature_creator': None,
})

In [25]:
# wandb config:
wandb_config = {
    'name': f"{os.environ['WANDB_NOTEBOOK_NAME'][:-6]}_{datetime.now().strftime('%H%M%S')}", # just removes the .ipynb extension, leaving the notebook filename's stem
    'tags': ['widedeep', 'deeplearning'],
    'notes': "Part of trials to decide dataset inputs for widedeep models. Scaling after reduction, concatenation."
}

In [26]:
wide = Wide(wide_dim=np.unique(X_wide).shape[0], pred_dim=1)

In [27]:
deeptabular = TabMlp(continuous_cols=cont_cols, column_idx=tab_preprocessor.column_idx)
# deeptabular = SAINT(continuous_cols=cont_cols, column_idx=tab_preprocessor.column_idx,)

In [28]:
model = WideDeep(wide=wide, deeptabular=deeptabular)
# model.load_state_dict(torch.load(datapath/"saint_20211121_weights_25epochs/wd_model.pt"))

In [29]:
# del X_deep_train, X_deep_valid
X_wide_train, X_wide_valid, y_train, y_valid = train_test_split(X_wide, y, test_size=0.2, random_state=42)
X_tab_train, X_tab_valid, _, _ = train_test_split(X_tab, y, test_size=0.2, random_state=42)

In [30]:
y_train = np.array(y_train)
y_valid = np.array(y_valid)

From (shaky) earlier sweep, I got these as best params for the TabMLP model:

```python
{'learning_rate': 0.08763568442121664,
 'weight_decay': 4.414536876494478e-05,
 'wide_optimization': 'AdamW',
 'tab_optimization': 'SGD',
 'wide_momentum': 0.2862031274746775,
 'tab_momentum': 0.7220103849055353,
 'wide_scheduler': 'CosineAnnealingWarmRestarts',
 'tab_scheduler': 'ReduceLROnPlateau'}
```

In [31]:
exmodel_config['training_params'] = str({'learning_rate': 0.08763568442121664, 'weight_decay': 4.414536876494478e-05, 'wide_optimization': 'AdamW', 'tab_optimization': 'SGD', 'wide_momentum': 0.2862031274746775, 'tab_momentum': 0.7220103849055353, 'wide_scheduler': 'CosineAnnealingWarmRestarts', 'tab_scheduler': 'ReduceLROnPlateau'})

In [None]:
wandb.init(
            project="202111_Kaggle_tabular_playground",
            save_code=True,
            tags=wandb_config['tags'],
            name=wandb_config['name'],
            notes=wandb_config['notes'],
            config=exmodel_config
    )

[34m[1mwandb[0m: Currently logged in as: [33mhushifang[0m (use `wandb login --relogin` to force relogin)
[34m[1mwandb[0m: wandb version 0.12.7 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


In [None]:
n_epochs = 20

lr = 0.08763568442121664
wd = 4.414536876494478e-05

wide_opt = AdamW(model.wide.parameters(), lr=lr)
deep_opt = SGD(model.deeptabular.parameters(), lr=lr, weight_decay=wd, momentum=0.7220103849055353)

# wide_sch = OneCycleLR(optimizer=wide_opt, max_lr=0.01, steps_per_epoch=X_wide_train.shape[0], epochs=n_epochs)
# deep_sch = OneCycleLR(optimizer=deep_opt, max_lr=0.01, steps_per_epoch=X_tab_train.shape[0], epochs=n_epochs)
wide_sch = CosineAnnealingWarmRestarts(optimizer=wide_opt, T_0=5) 
deep_sch = ReduceLROnPlateau(optimizer=deep_opt, )

optimizers = {'wide': wide_opt, 'deeptabular': deep_opt }
lr_schedulers = {'wide': wide_sch, 'deeptabular': deep_sch }

callbacks = [
    LRHistory(n_epochs=n_epochs), 
]

In [None]:
# torch.cuda.empty_cache()

In [None]:
# trainer
trainer = Trainer(model=model, 
                  objective='binary', 
                  metrics=[Accuracy], # with AUROC got TypeError: '>' not supported between instances of 'NoneType' and 'int' 
                  seed=42, 
                  optimizers=optimizers,
                  callbacks=callbacks
                 )

#             print(f"type(X_train_wide) is {type(X_train_wide)} and type(X_train_tab) is {type(X_train_tab)}")
trainer.fit( 
    X_wide=X_wide_train,
    X_tab=X_tab_train,
    target=y_train,
    n_epochs=n_epochs,
    batch_size=1024, # default value is 32; 1024 works for TabMLP
#                 val_split=0.2, # no need for this
)

y_valid_preds = trainer.predict_proba(X_wide=X_wide_valid, X_tab=X_tab_valid, batch_size=1024)[:,1]
           
    

In [None]:
trainer.save(path=datapath/'widedeep_tabmlp-202111271032-weights-20epochs', save_state_dict=True)

In [None]:
valid_auc = roc_auc_score(y_score=y_valid_preds, y_true=y_valid)

In [None]:
valid_auc

In [None]:
wandb.log({'overall_valid_auc': valid_auc})

In [36]:
wandb.finish()

VBox(children=(Label(value=' 0.13MB of 0.13MB uploaded (0.06MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
overall_valid_auc,▁

0,1
overall_valid_auc,0.98348


# WideDeep Trainer

In [3]:
def trainer(model, X_wide = X_bins, X_deep=X_gauss, y=y, exmodel_config=exmodel_config, wandb_config=wandb_config, random_state=42, wandb_tracked=True):
    """
    Simple trainer wrapper for widedeep models, with holdout
    """
    # concatenate together wide and deep data
    X = X_wide.join(X_deep)
    
    wide = Wide()
    
    deeptabular = TabMlp(
        mlp_hidden_dims=[64,32],
        continuous_cols=X_deep.columns,
    # scaling with GaussRankScaler, before doing holdout split
    # scaler = GaussRankScaler(X)
    # scaler.fit_transform(X)
    
    # skipping denoising for now
    
    # holdout split
    X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)    
    
    if wandb_tracked:
        exmodel_config['arch'] = arch
        exmodel_config[f'{arch}_params'] = str(params)
        wandb.init(
            project="202111_Kaggle_tabular_playground",
            save_code=True,
            tags=wandb_config['tags'],
            name=wandb_config['name'],
            notes=wandb_config['notes'],
            config=exmodel_config
    )

SyntaxError: invalid syntax (<ipython-input-3-ef23d42afe3c>, line 3)

In [17]:
saint_model = TabAttention(categories=None, num_continuous=X.shape[1], dim=)

  and should_run_async(code)


TypeError: __init__() missing 5 required keyword-only arguments: 'categories', 'num_continuous', 'dim', 'depth', and 'heads'

In [None]:
from sklearn.

In [34]:
def cross_validate_pytorch_model(arch:str, X, y, X_test, params:dict={}, start_fold=0, 
                         exmodel_config=exmodel_config, wandb_config=wandb_config, 
                         random_state=42, shuffle_kfolds=True, wandb_tracked=True, encode_cats=False):
    """
    Modification of the `cross_validate_model` function used in my stacking notebooks, customized to the dataset and to deep learning approaches.
    """
    
    # prepare for k-fold cross-validation; random-state here is notebook-wide, not per-model
    # shuffle on the initial sets, but not subsequently -- performing the same operation twice means a very different dataset
    if shuffle_kfolds:
        kfold = exmodel_config['cross_val_strategy'](n_splits=exmodel_config['kfolds'], shuffle=True, random_state=SEED)
    else:
        kfold = exmodel_config['cross_val_strategy'](n_splits=exmodel_config['kfolds'], shuffle=False)
    
    if wandb_tracked:
        exmodel_config['arch'] = arch
        exmodel_config[f'{arch}_params'] = str(params)
        wandb.init(
            project="202111_Kaggle_tabular_playground",
            save_code=True,
            tags=wandb_config['tags'],
            name=wandb_config['name'],
            notes=wandb_config['notes'],
            config=exmodel_config
    )   
    
    # initialize lists for out-of-fold preds and ground truth
    oof_preds, oof_y = [], []
    
    # initialize a numpy.ndarray containing the fold-model's preds for test set
    test_preds = np.zeros((X_test.shape[0]))
    
    for fold, (train_ids, valid_ids) in enumerate(kfold.split(X,y)):
#         print(f"type(train_ids) = {type(train_ids)} and train_ids.shape = {train_ids.shape}")
#         print(f"type(valid_ids) = {type(valid_ids)} and train_ids.shape = {valid_ids.shape}")
        if fold < start_fold: # skip folds that are already trained
            continue
        else:
            print(f"FOLD {fold}")
            print("---------------------------------------------------")
            y_train, y_valid = y[train_ids], y[valid_ids] # y will be an np.ndarray already; handling will be same regardless of model
            print(f"y_train shape is {y_train.shape}, y_valid shape is {y_valid.shape}")
            if isinstance(X, np.ndarray):
                X_train, X_valid = X[train_ids], X[valid_ids]
                # X_train = pd.DataFrame(X_train, columns=
            else:
                X_train, X_valid = X.iloc[train_ids,:], X.iloc[valid_ids,:] # bc need pandas.DataFrames for ce
            
            # print(f"X_train shape is {X_train.shape}")
            # print(f"X_valid shape is {X_valid.shape}")
            # print(f"X_test shape is {X_test.shape}")
            
            # scaling
            # scaler = GaussRankScaler()
            # X_train = pd.DataFrame(scaler.fit_transform(X_train), columns=X.columns)
            # X_valid = pd.DataFrame(scaler.transform(X_valid), columns=X.columns)
            # X_test = pd.DataFrame(scaler.transform(X_test), columns=X.columns)
            
            # print("Scaling complete")
            # print(f"X_train shape is {X_train.shape}")
            # print(f"X_valid shape is {X_valid.shape}")
            # print(f"X_test shape is {X_test.shape}")
            
            # embedding & library-specific preprocessing
            tab_preprocessor = TabPreprocessor(
                # scale=False, # because GaussRank scaling already occurred
                scale=True
                for_transformer=False, # change if using a Transformer-based model
                continuous_cols=X.columns,
                # continuous_cols=range(X.shape[1]), # since it'll be working on a numpy.ndarray
                auto_embed_dim=True, # uses fastai's rule of thumb
            )#, embed_cols=embed_cols, )
            X_train = tab_preprocessor.fit_transform(X_train)   
            X_valid = tab_preprocessor.transform(X_valid)
            X_test = tab_preprocessor.transform(X_test)
            
            print("Tab preprocessing complete.")
            print(f"Type of X_train is {type(X_train)}")
            # print(f"X_train shape is {X_train.shape}")
            # print(f"X_valid shape is {X_valid.shape}")
            # print(f"X_test shape is {X_test.shape}")
            
            # define model
            deeptabular = TabMlp(
                mlp_hidden_dims=[64,32],
                column_idx=tab_preprocessor.column_idx,
            #     embed_input=tab_preprocessor.embeddings_input,
                # continuous_cols=range(X.shape[1]), # since it'll be working on a numpy.ndarray
                continuous_cols=X.columns,
            )

            n_epochs = 30

            model = WideDeep(wide=None, deeptabular=deeptabular)

            # pytorch hyperparams
            deep_opt = AdamW(model.parameters(), lr=0.1)

            # deep_sch = OneCycleLR(optimizer=deep_opt, max_lr=0.01, steps_per_epoch=X_train_tab.shape[0], epochs=n_epochs)

            # optimizers = {'deeptabular': deep_opt }
            # lr_schedulers = {'deeptabular': deep_sch }


            callbacks = [
                LRHistory(n_epochs=n_epochs), 
            ]



            # trainer
            trainer = Trainer(model=model, 
                              objective='binary', 
                              metrics=[Accuracy], # with AUROC got TypeError: '>' not supported between instances of 'NoneType' and 'int' 
                              seed=random_state, 
                              optimizers=deep_opt,
                              callbacks=callbacks
                             )

    #             print(f"type(X_train_wide) is {type(X_train_wide)} and type(X_train_tab) is {type(X_train_tab)}")
            trainer.fit( # this is where problem is beginning
                # X_wide=X_train_wide,
                X_tab=np.array(X_train),
                target=np.array(y_train),
                n_epochs=n_epochs,
                batch_size=1024, # default value is 32
    #                 val_split=0.2, # no need for this
            )

            y_valid_preds = trainer.predict_proba(X_tab=np.array(X_valid), batch_size=1024)[:,1]

            # add the fold-model's OOF preds and ground truths to the out-of-loop lists
            oof_preds.extend(y_valid_preds)
            oof_y.extend(y_valid)


            # test set inference
            fold_test_preds = trainer.predict_proba(X_tab=np.array(X_test), batch_size=1024)[:,1]
            test_preds += fold_test_preds
            
            print(f"NaNs in y_valid_preds: {np.isnan(y_valid_preds).any()}")
            print(f"NaNs in y_valid: {np.isnan(y_valid).any()}")
        
        
       
        
        

    #         valid_loss = log_loss(y_valid, y_pred)
            # give the valid AUC score, for edification
            fold_valid_auc = roc_auc_score(y_valid, y_valid_preds)
            if wandb_tracked:
                wandb.log({f'fold{fold}_valid_roc_auc': fold_valid_auc})
            print(f"Valid AUC for fold {fold} is {fold_valid_auc}")   
        # dump(model, Path(runpath/f"{arch}_fold{fold}_rs{random_state}_model.joblib"))

    model_valid_auc = roc_auc_score(oof_y, oof_preds)
    print(f"Valid AUC score for {arch} model is {model_valid_auc}")
    if wandb_tracked:
        wandb.log({'overall_valid_auc': model_valid_auc,
                   'model_params': str(model.parameters()) if 'widedeep' in arch else str(model.get_params()),
                   'model_seed': random_state,
                  })
        wandb.finish()
    
    # finalize test preds
    test_preds /= exmodel_config['kfolds']
    
    # save OOF preds and test-set preds
#     if 'widedeep' in arch:
#         dump(oof_preds, Path(predpath/f"{wandb_config['name']}_{arch}_{exmodel_config['kfolds']}folds_{n_epochs}epochs-per-fold_rs{random_state}_oof_preds.joblib"))
#         dump(test_preds, Path(predpath/f"{wandb_config['name']}_{arch}_{exmodel_config['kfolds']}folds_{n_epochs}epochs-per-fold_rs{random_state}_test_preds.joblib"))
    
#     else:
#         dump(oof_preds, Path(predpath/f"{wandb_config['name']}_{arch}_{exmodel_config['kfolds']}folds_rs{random_state}_oof_preds.joblib"))
#         dump(test_preds, Path(predpath/f"{wandb_config['name']}_{arch}_{exmodel_config['kfolds']}folds_rs{random_state}_test_preds.joblib"))
    
    if not (datapath/f"{exmodel_config['kfolds']}folds_rs{SEED}_oof_y.joblib").is_file():
        dump(oof_y, predpath/f"{exmodel_config['kfolds']}folds_rs{SEED}_oof_y.joblib")
    
#     if wandb_tracked:
# #         if 'widedeep' in arch:
#         wandb.log({'model_valid_auc': model_valid_auc,
# #                    'oof_preds': oof_preds,
# #                    'test_preds': test_preds,
#                    'model_params': str(model.parameters()) if 'widedeep' in arch else str(model.get_params()), 
#         #                    'model_params': str(model.get_params()),
#         })
# #         wandb.log({'model_valid_auc': model_valid_auc,
# #                    'oof_preds': oof_preds,
# #                    'test_preds': test_preds,
# # #                    'model_params': str(model.get_params()),
# #                   })
#         wandb.finish()
    return oof_preds, test_preds
        

  and should_run_async(code)


SyntaxError: invalid syntax (<ipython-input-34-5c6737837e8e>, line 68)

In [35]:
oof_preds, test_preds = cross_validate_pytorch_model('widedeep-TabMLP', X, y, X_test, wandb_tracked=False)

FOLD 0
---------------------------------------------------
y_train shape is (480000,), y_valid shape is (120000,)


epoch 1:   2%|▏         | 8/469 [00:00<00:06, 72.72it/s, loss=0.852, metrics={'acc': 0.498}] 

Tab preprocessing complete.
Type of X_train is <class 'numpy.ndarray'>


epoch 1: 100%|██████████| 469/469 [00:05<00:00, 78.36it/s, loss=0.647, metrics={'acc': 0.6435}]
epoch 2: 100%|██████████| 469/469 [00:06<00:00, 75.90it/s, loss=0.64, metrics={'acc': 0.6527}] 
epoch 3: 100%|██████████| 469/469 [00:06<00:00, 68.60it/s, loss=0.645, metrics={'acc': 0.6499}]
epoch 4: 100%|██████████| 469/469 [00:06<00:00, 76.83it/s, loss=0.648, metrics={'acc': 0.6462}]
epoch 5: 100%|██████████| 469/469 [00:06<00:00, 76.18it/s, loss=0.648, metrics={'acc': 0.6472}]
epoch 6: 100%|██████████| 469/469 [00:06<00:00, 74.80it/s, loss=0.645, metrics={'acc': 0.65}]  
epoch 7: 100%|██████████| 469/469 [00:05<00:00, 78.54it/s, loss=0.656, metrics={'acc': 0.634}] 
epoch 8: 100%|██████████| 469/469 [00:06<00:00, 72.74it/s, loss=0.693, metrics={'acc': 0.5112}]
epoch 9: 100%|██████████| 469/469 [00:06<00:00, 74.71it/s, loss=0.691, metrics={'acc': 0.5116}]
epoch 10: 100%|██████████| 469/469 [00:06<00:00, 76.26it/s, loss=0.699, metrics={'acc': 0.5246}]
epoch 11: 100%|██████████| 469/469 [00:

NaNs in y_valid_preds: True
NaNs in y_valid: False


ValueError: Input contains NaN, infinity or a value too large for dtype('float64').