# widedeep_20211120
I want to start playing around with some NN architectures. Eventually, I want to try some straight PyTorch, but for starters, I'll use `widedeep`. As a scaler, I'll use RankGauss; I won't (yet) do any feature reduction or selection. I also won't (yet) use `cleanlab`, though I may try it in the future, either via the wrapper they suggest or via `skorch`

In [1]:
# notebook configuration
COLAB = False # will trigger manual installation of packages
USE_GPU = True 

In [2]:
# basic imports
from pathlib import Path
import os
import math
from datetime import datetime
import random
import gc; gc.enable()

In [3]:
%matplotlib inline
%config Completer.use_jedi = False
os.environ['WANDB_NOTEBOOK_NAME'] = f"widedeep_{datetime.now().strftime('%Y%m%d')}.ipynb"

Now, non-stdlib imports

In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm

# general ML tooling
from sklearn.model_selection import train_test_split, StratifiedKFold, KFold
from sklearn.metrics import log_loss, roc_auc_score
import wandb
from wandb.xgboost import wandb_callback
from wandb.lightgbm import wandb_callback
from sklearn.impute import SimpleImputer #, KNNImputer

import seaborn as sns

# from catboost import CatBoostClassifier
# from xgboost import XGBClassifier
# from lightgbm import LGBMClassifier
# from sklearn.ensemble import StackingClassifier, RandomForestClassifier
from sklearn.preprocessing import RobustScaler #StandardScaler #, MinMaxScaler, MaxAbsScaler, RobustScaler, PolynomialFeatures
# from sklearn.linear_model import LogisticRegression
# from sklearn.feature_selection import SelectKBest, f_regression
from joblib import dump, load
# feature engineering tools
# from sklearn.feature_selection import mutual_info_regression
# import featuretools as ft

# from BorutaShap import BorutaShap
from gauss_rank_scaler import GaussRankScaler


In [5]:
from SAINT import TabAttention # from the official SAINT implementation as of 20211118, https://github.com/somepago/saint/blob/main/models/model.py

In [6]:
from pytorch_widedeep import Trainer
from pytorch_widedeep.preprocessing import WidePreprocessor, TabPreprocessor
from pytorch_widedeep.models import Wide, TabMlp, WideDeep, SAINT, TabTransformer, TabNet, TabFastFormer, TabResnet
from pytorch_widedeep.metrics import Accuracy
from torchmetrics import AUROC
import torch
from torch.optim import Adam, AdamW, Adagrad, SGD, RMSprop, LBFGS
from torch.optim.lr_scheduler import ReduceLROnPlateau, CosineAnnealingWarmRestarts, CyclicLR, OneCycleLR, StepLR, CosineAnnealingLR
from pytorch_widedeep.callbacks import EarlyStopping, LRHistory, ModelCheckpoint

In [7]:
# import category_encoders as ce

  and should_run_async(code)


Now, datapath setup

In [8]:
if COLAB:
    # mount Google Drive
    from google.colab import drive
    drive.mount('/content/drive')
    
    # handling datapath
    datapath = Path('/content/drive/MyDrive/kaggle/tabular_playgrounds/nov2021/')
    
else:
    # if on local machine
#     datapath = Path('/media/sf/easystore/kaggle_data/tabular_playgrounds/sep2021/')  
    root = Path('/media/sf/easystore/kaggle_data/tabular_playgrounds/nov2021/')
    datapath = root/'datasets'
    # edapath = root/'EDA'
    # modelpath = Path('/media/sf/easystore/kaggle_data/tabular_playgrounds/oct2021/models/')
    predpath = root/'preds'
    subpath = root/'submissions'
    
    for pth in [datapath, predpath, subpath]:
        pth.mkdir(exist_ok=True)
    


In [9]:
SEED = 42

# Function to seed everything
def seed_everything(seed):
    random.seed(seed)
    np.random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)

seed_everything(seed=SEED)

In [10]:
def reduce_memory_usage(df, verbose=True):
    numerics = ["int8", "int16", "int32", "int64", "float16", "float32", "float64"]
    start_mem = df.memory_usage().sum() / 1024 ** 2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == "int":
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if (
                    c_min > np.finfo(np.float16).min
                    and c_max < np.finfo(np.float16).max
                ):
                    df[col] = df[col].astype(np.float16)
                elif (
                    c_min > np.finfo(np.float32).min
                    and c_max < np.finfo(np.float32).max
                ):
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024 ** 2
    if verbose:
        print(
            "Mem. usage decreased to {:.2f} Mb ({:.1f}% reduction)".format(
                end_mem, 100 * (start_mem - end_mem) / start_mem
            )
        )
    return df

In [11]:
# dataset_params will initially include either trivial class instances or loaded, precomputed artifacts
dataset_params = {
    'train_source': str(datapath/'X_orig.feather'),
    'target_source': str(datapath/'y_orig.joblib'),
    'test_source': str(datapath/'X_test_orig-no_scaling.feather'),
    'scaler': str(GaussRankScaler()),
    # 'pca': str(load(datapath/'pca_mle-RobustScaled_orig_trainset.joblib')),
    # 'umap': str(load(datapath/'umap_reducer-20211107-n_comp10-n_neighbors15-rs42-pca_mle-RobustScaled_orig_trainset.joblib')),
}   

# referring back to the already-entered attributes, specify how the pipeline was sequenced
# dataset_params['preprocessing_pipeline'] = str([dataset_params['scaler'], dataset_params['pca'], dataset_params['umap']]) # ACTUALLY this is unwieldy
# dataset_params['preprocessing_pipeline'] = '[scaler, pca, umap]' # more fragile, but also more readable

# now, load the datasets and generate more metadata from them
X = pd.read_feather(dataset_params['train_source'])# load(dataset_params['train_source'])
y = load(dataset_params['target_source'])
X_test = pd.read_feather(dataset_params['test_source']) #load(dataset_params['test_source'])

dataset_params['feature_count'] = X.shape[1]
dataset_params['instance_count'] = X.shape[0]

In [12]:
# decrease memory footprint
X = reduce_memory_usage(X)
X_test = reduce_memory_usage(X_test)

  and should_run_async(code)


Mem. usage decreased to 114.44 Mb (75.0% reduction)
Mem. usage decreased to 103.00 Mb (75.0% reduction)


In [13]:
# meta-config for preprocessing and cross-validation, but NOT for model parameters
exmodel_config = {
#     "feature_selector": SelectKBest,
#     "k_best": 80,
#     "feature_selection_scoring": f_regression,
#     'random_state': SEED,
#     'feature_generation': ['NaN_counts', 'SummaryStats', 'NaN_OneHots'],
#     'subsample': 1,
    'cross_val_strategy': None, # None for holdout, or the relevant sklearn class
    'kfolds': 1, # if 1, that means just doing holdout
    'test_size': 0.2,
    **dataset_params
#     'features_created': False,
#     'feature_creator': None,
}

In [14]:
# wandb config:
wandb_config = {
    'name': f"{os.environ['WANDB_NOTEBOOK_NAME'][:-6]}_{datetime.now().strftime('%H%M%S')}", # just removes the .ipynb extension, leaving the notebook filename's stem
    'tags': ['widedeep', 'deeplearning'],
    'notes': "Trying a variety of widedeep models, to see if I can get any working properly."
}

# Preprocessing Data
Inspired a bit by Laurent Pourchot's Aug2021 Tabular Playground entry, I'm going to try to generate two versions of the dataset: a categorical one, using bins, and then (for now) a GaussRankScaled one. In the future, I might add further variations, e.g. with feature reduction via PCA and perhaps also UMAP and also denoising; I might also try other normalizations, e.g. Quantile.

## Binning (Generating wide cols)

In [15]:
# h/t Laurent Pourchot https://www.kaggle.com/pourchot/in-python-tabular-denoising-residual-network/

# 100 bins for the bins head of the NN (i.e. percentiles):
X_bins = np.zeros((X.shape[0],X.shape[1])) # he used all available data for the first tuple entry, but I'll start like this

In [16]:
X_bins.shape

(600000, 100)

In [17]:
for i in range(X.shape[1]): # assumes X is a pd.DataFrame
    X_bins[:,i] = pd.qcut(X.iloc[:,i],X.shape[1],labels=False,duplicates = 'drop')
# blabeled = X_bins[:X.shape[0],:]
# bunlabeled = X_ins[X.shape[0]:,:]

In [18]:
X_bins

  and should_run_async(code)


array([[52., 70., 49., ..., 37., 64., 88.],
       [56., 35., 33., ..., 48., 10., 28.],
       [28., 31., 70., ..., 40., 79., 85.],
       ...,
       [94., 11., 31., ..., 16., 64., 46.],
       [80., 75., 89., ..., 58., 51., 17.],
       [64., 67., 56., ..., 37., 44., 41.]])

In [19]:
X_bins = X_bins.astype(np.int8)

In [20]:
X_bins = pd.DataFrame(X_bins, index=X.index, columns=[f'rkd_f{col}' for col in range(100)])

In [21]:
X_bins.head()

Unnamed: 0,rkd_f0,rkd_f1,rkd_f2,rkd_f3,rkd_f4,rkd_f5,rkd_f6,rkd_f7,rkd_f8,rkd_f9,...,rkd_f90,rkd_f91,rkd_f92,rkd_f93,rkd_f94,rkd_f95,rkd_f96,rkd_f97,rkd_f98,rkd_f99
0,52,70,49,60,61,23,68,44,42,20,...,18,22,22,6,44,52,83,37,64,88
1,56,35,33,63,69,95,22,1,88,36,...,90,70,24,94,87,97,2,48,10,28
2,28,31,70,41,29,60,94,75,36,48,...,86,94,58,30,76,10,0,40,79,85
3,11,7,89,34,18,30,48,87,88,83,...,6,70,11,7,49,35,10,37,39,41
4,15,73,55,40,24,41,78,3,66,33,...,20,40,4,86,79,32,20,19,13,58


## Normalizing (Preprocessing Deep Cols)

In [23]:
scaler = GaussRankScaler()
X_gauss = scaler.fit_transform(X)

  and should_run_async(code)


In [24]:
X.head()

  and should_run_async(code)


Unnamed: 0,f0,f1,f2,f3,f4,f5,f6,f7,f8,f9,...,f90,f91,f92,f93,f94,f95,f96,f97,f98,f99
0,0.106628,3.59375,132.75,3.183594,0.08197,1.188477,3.732422,2.265625,2.099609,0.012329,...,0.010742,1.098633,0.013329,-0.011719,0.052765,0.06543,4.210938,1.978516,0.085999,0.240479
1,0.125,1.673828,76.5625,3.378906,0.099426,5.09375,1.275391,-0.471436,4.546875,0.03772,...,0.135864,3.460938,0.017059,0.124878,0.154053,0.606934,-0.267822,2.578125,-0.020874,0.024719
2,0.036316,1.49707,233.5,2.195312,0.026917,3.126953,5.058594,3.849609,1.801758,0.057007,...,0.11731,4.882812,0.085205,0.03241,0.116089,-0.001689,-0.52002,2.140625,0.124451,0.148193
3,-0.014076,0.245972,780.0,1.890625,0.006947,1.53125,2.697266,4.515625,4.503906,0.123474,...,-0.01535,3.474609,-0.017105,-0.008102,0.062012,0.041199,0.511719,1.96875,0.040009,0.044861
4,-0.00326,3.714844,156.125,2.148438,0.01828,2.097656,4.15625,-0.038239,3.371094,0.03418,...,0.013779,1.910156,-0.042938,0.105591,0.125122,0.037506,1.043945,1.075195,-0.012817,0.072815


In [25]:
X_gauss = pd.DataFrame(X_gauss, columns=X.columns, index=X.index)

  and should_run_async(code)


In [26]:
X_gauss.head()

Unnamed: 0,f0,f1,f2,f3,f4,f5,f6,f7,f8,f9,...,f90,f91,f92,f93,f94,f95,f96,f97,f98,f99
0,0.503864,1.170148,0.497627,1.061322,0.497063,0.697261,1.178046,0.918892,0.89803,0.26461,...,0.313791,0.697968,0.228935,-0.745756,0.410898,0.511767,1.301817,0.886265,0.491337,0.715292
1,0.532606,0.817177,0.414955,1.09994,0.523378,1.430099,0.707201,-1.001742,1.361183,0.395362,...,0.712643,1.133297,0.258559,0.941823,0.550487,0.937882,-0.799881,0.975537,-0.595291,0.375001
2,0.364383,0.777697,0.581712,0.902238,0.358373,1.046707,1.450572,1.213861,0.833888,0.448396,...,0.684576,1.420673,0.444982,0.597667,0.51478,-0.345367,-1.076664,0.911155,0.547897,0.627946
3,-0.552709,0.361766,0.800397,0.847488,0.20525,0.767239,0.975344,1.338971,1.352906,0.554931,...,-0.698743,1.136185,-0.622266,-0.67141,0.436283,0.439042,0.506585,0.883681,0.397615,0.451311
4,-0.357576,1.199284,0.516476,0.895768,0.313432,0.890002,1.273778,-0.348193,1.110106,0.385873,...,0.349392,0.873186,-0.772866,0.873466,0.528289,0.427691,0.67996,0.684102,-0.528382,0.520949


# WideDeep setup

In [3]:
def trainer(model, X_wide = X_bins, X_deep=X_gauss, y=y, exmodel_config=exmodel_config, wandb_config=wandb_config, random_state=42, wandb_tracked=True):
    """
    Simple trainer wrapper for widedeep models, with holdout
    """
    # concatenate together wide and deep data
    X = X_wide.join(X_deep)
    
    wide = Wide()
    
    deeptabular = TabMlp(
        mlp_hidden_dims=[64,32],
        continuous_cols=X_deep.columns,
    # scaling with GaussRankScaler, before doing holdout split
    # scaler = GaussRankScaler(X)
    # scaler.fit_transform(X)
    
    # skipping denoising for now
    
    # holdout split
    X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)    
    
    if wandb_tracked:
        exmodel_config['arch'] = arch
        exmodel_config[f'{arch}_params'] = str(params)
        wandb.init(
            project="202111_Kaggle_tabular_playground",
            save_code=True,
            tags=wandb_config['tags'],
            name=wandb_config['name'],
            notes=wandb_config['notes'],
            config=exmodel_config
    )

SyntaxError: invalid syntax (<ipython-input-3-ef23d42afe3c>, line 3)

In [17]:
saint_model = TabAttention(categories=None, num_continuous=X.shape[1], dim=)

  and should_run_async(code)


TypeError: __init__() missing 5 required keyword-only arguments: 'categories', 'num_continuous', 'dim', 'depth', and 'heads'

In [34]:
def cross_validate_pytorch_model(arch:str, X, y, X_test, params:dict={}, start_fold=0, 
                         exmodel_config=exmodel_config, wandb_config=wandb_config, 
                         random_state=42, shuffle_kfolds=True, wandb_tracked=True, encode_cats=False):
    """
    Modification of the `cross_validate_model` function used in my stacking notebooks, customized to the dataset and to deep learning approaches.
    """
    
    # prepare for k-fold cross-validation; random-state here is notebook-wide, not per-model
    # shuffle on the initial sets, but not subsequently -- performing the same operation twice means a very different dataset
    if shuffle_kfolds:
        kfold = exmodel_config['cross_val_strategy'](n_splits=exmodel_config['kfolds'], shuffle=True, random_state=SEED)
    else:
        kfold = exmodel_config['cross_val_strategy'](n_splits=exmodel_config['kfolds'], shuffle=False)
    
    if wandb_tracked:
        exmodel_config['arch'] = arch
        exmodel_config[f'{arch}_params'] = str(params)
        wandb.init(
            project="202111_Kaggle_tabular_playground",
            save_code=True,
            tags=wandb_config['tags'],
            name=wandb_config['name'],
            notes=wandb_config['notes'],
            config=exmodel_config
    )   
    
    # initialize lists for out-of-fold preds and ground truth
    oof_preds, oof_y = [], []
    
    # initialize a numpy.ndarray containing the fold-model's preds for test set
    test_preds = np.zeros((X_test.shape[0]))
    
    for fold, (train_ids, valid_ids) in enumerate(kfold.split(X,y)):
#         print(f"type(train_ids) = {type(train_ids)} and train_ids.shape = {train_ids.shape}")
#         print(f"type(valid_ids) = {type(valid_ids)} and train_ids.shape = {valid_ids.shape}")
        if fold < start_fold: # skip folds that are already trained
            continue
        else:
            print(f"FOLD {fold}")
            print("---------------------------------------------------")
            y_train, y_valid = y[train_ids], y[valid_ids] # y will be an np.ndarray already; handling will be same regardless of model
            print(f"y_train shape is {y_train.shape}, y_valid shape is {y_valid.shape}")
            if isinstance(X, np.ndarray):
                X_train, X_valid = X[train_ids], X[valid_ids]
                # X_train = pd.DataFrame(X_train, columns=
            else:
                X_train, X_valid = X.iloc[train_ids,:], X.iloc[valid_ids,:] # bc need pandas.DataFrames for ce
            
            # print(f"X_train shape is {X_train.shape}")
            # print(f"X_valid shape is {X_valid.shape}")
            # print(f"X_test shape is {X_test.shape}")
            
            # scaling
            # scaler = GaussRankScaler()
            # X_train = pd.DataFrame(scaler.fit_transform(X_train), columns=X.columns)
            # X_valid = pd.DataFrame(scaler.transform(X_valid), columns=X.columns)
            # X_test = pd.DataFrame(scaler.transform(X_test), columns=X.columns)
            
            # print("Scaling complete")
            # print(f"X_train shape is {X_train.shape}")
            # print(f"X_valid shape is {X_valid.shape}")
            # print(f"X_test shape is {X_test.shape}")
            
            # embedding & library-specific preprocessing
            tab_preprocessor = TabPreprocessor(
                # scale=False, # because GaussRank scaling already occurred
                scale=True
                for_transformer=False, # change if using a Transformer-based model
                continuous_cols=X.columns,
                # continuous_cols=range(X.shape[1]), # since it'll be working on a numpy.ndarray
                auto_embed_dim=True, # uses fastai's rule of thumb
            )#, embed_cols=embed_cols, )
            X_train = tab_preprocessor.fit_transform(X_train)   
            X_valid = tab_preprocessor.transform(X_valid)
            X_test = tab_preprocessor.transform(X_test)
            
            print("Tab preprocessing complete.")
            print(f"Type of X_train is {type(X_train)}")
            # print(f"X_train shape is {X_train.shape}")
            # print(f"X_valid shape is {X_valid.shape}")
            # print(f"X_test shape is {X_test.shape}")
            
            # define model
            deeptabular = TabMlp(
                mlp_hidden_dims=[64,32],
                column_idx=tab_preprocessor.column_idx,
            #     embed_input=tab_preprocessor.embeddings_input,
                # continuous_cols=range(X.shape[1]), # since it'll be working on a numpy.ndarray
                continuous_cols=X.columns,
            )

            n_epochs = 30

            model = WideDeep(wide=None, deeptabular=deeptabular)

            # pytorch hyperparams
            deep_opt = AdamW(model.parameters(), lr=0.1)

            # deep_sch = OneCycleLR(optimizer=deep_opt, max_lr=0.01, steps_per_epoch=X_train_tab.shape[0], epochs=n_epochs)

            # optimizers = {'deeptabular': deep_opt }
            # lr_schedulers = {'deeptabular': deep_sch }


            callbacks = [
                LRHistory(n_epochs=n_epochs), 
            ]



            # trainer
            trainer = Trainer(model=model, 
                              objective='binary', 
                              metrics=[Accuracy], # with AUROC got TypeError: '>' not supported between instances of 'NoneType' and 'int' 
                              seed=random_state, 
                              optimizers=deep_opt,
                              callbacks=callbacks
                             )

    #             print(f"type(X_train_wide) is {type(X_train_wide)} and type(X_train_tab) is {type(X_train_tab)}")
            trainer.fit( # this is where problem is beginning
                # X_wide=X_train_wide,
                X_tab=np.array(X_train),
                target=np.array(y_train),
                n_epochs=n_epochs,
                batch_size=1024, # default value is 32
    #                 val_split=0.2, # no need for this
            )

            y_valid_preds = trainer.predict_proba(X_tab=np.array(X_valid), batch_size=1024)[:,1]

            # add the fold-model's OOF preds and ground truths to the out-of-loop lists
            oof_preds.extend(y_valid_preds)
            oof_y.extend(y_valid)


            # test set inference
            fold_test_preds = trainer.predict_proba(X_tab=np.array(X_test), batch_size=1024)[:,1]
            test_preds += fold_test_preds
            
            print(f"NaNs in y_valid_preds: {np.isnan(y_valid_preds).any()}")
            print(f"NaNs in y_valid: {np.isnan(y_valid).any()}")
        
        
       
        
        

    #         valid_loss = log_loss(y_valid, y_pred)
            # give the valid AUC score, for edification
            fold_valid_auc = roc_auc_score(y_valid, y_valid_preds)
            if wandb_tracked:
                wandb.log({f'fold{fold}_valid_roc_auc': fold_valid_auc})
            print(f"Valid AUC for fold {fold} is {fold_valid_auc}")   
        # dump(model, Path(runpath/f"{arch}_fold{fold}_rs{random_state}_model.joblib"))

    model_valid_auc = roc_auc_score(oof_y, oof_preds)
    print(f"Valid AUC score for {arch} model is {model_valid_auc}")
    if wandb_tracked:
        wandb.log({'overall_valid_auc': model_valid_auc,
                   'model_params': str(model.parameters()) if 'widedeep' in arch else str(model.get_params()),
                   'model_seed': random_state,
                  })
        wandb.finish()
    
    # finalize test preds
    test_preds /= exmodel_config['kfolds']
    
    # save OOF preds and test-set preds
#     if 'widedeep' in arch:
#         dump(oof_preds, Path(predpath/f"{wandb_config['name']}_{arch}_{exmodel_config['kfolds']}folds_{n_epochs}epochs-per-fold_rs{random_state}_oof_preds.joblib"))
#         dump(test_preds, Path(predpath/f"{wandb_config['name']}_{arch}_{exmodel_config['kfolds']}folds_{n_epochs}epochs-per-fold_rs{random_state}_test_preds.joblib"))
    
#     else:
#         dump(oof_preds, Path(predpath/f"{wandb_config['name']}_{arch}_{exmodel_config['kfolds']}folds_rs{random_state}_oof_preds.joblib"))
#         dump(test_preds, Path(predpath/f"{wandb_config['name']}_{arch}_{exmodel_config['kfolds']}folds_rs{random_state}_test_preds.joblib"))
    
    if not (datapath/f"{exmodel_config['kfolds']}folds_rs{SEED}_oof_y.joblib").is_file():
        dump(oof_y, predpath/f"{exmodel_config['kfolds']}folds_rs{SEED}_oof_y.joblib")
    
#     if wandb_tracked:
# #         if 'widedeep' in arch:
#         wandb.log({'model_valid_auc': model_valid_auc,
# #                    'oof_preds': oof_preds,
# #                    'test_preds': test_preds,
#                    'model_params': str(model.parameters()) if 'widedeep' in arch else str(model.get_params()), 
#         #                    'model_params': str(model.get_params()),
#         })
# #         wandb.log({'model_valid_auc': model_valid_auc,
# #                    'oof_preds': oof_preds,
# #                    'test_preds': test_preds,
# # #                    'model_params': str(model.get_params()),
# #                   })
#         wandb.finish()
    return oof_preds, test_preds
        

  and should_run_async(code)


SyntaxError: invalid syntax (<ipython-input-34-5c6737837e8e>, line 68)

In [35]:
oof_preds, test_preds = cross_validate_pytorch_model('widedeep-TabMLP', X, y, X_test, wandb_tracked=False)

FOLD 0
---------------------------------------------------
y_train shape is (480000,), y_valid shape is (120000,)


epoch 1:   2%|▏         | 8/469 [00:00<00:06, 72.72it/s, loss=0.852, metrics={'acc': 0.498}] 

Tab preprocessing complete.
Type of X_train is <class 'numpy.ndarray'>


epoch 1: 100%|██████████| 469/469 [00:05<00:00, 78.36it/s, loss=0.647, metrics={'acc': 0.6435}]
epoch 2: 100%|██████████| 469/469 [00:06<00:00, 75.90it/s, loss=0.64, metrics={'acc': 0.6527}] 
epoch 3: 100%|██████████| 469/469 [00:06<00:00, 68.60it/s, loss=0.645, metrics={'acc': 0.6499}]
epoch 4: 100%|██████████| 469/469 [00:06<00:00, 76.83it/s, loss=0.648, metrics={'acc': 0.6462}]
epoch 5: 100%|██████████| 469/469 [00:06<00:00, 76.18it/s, loss=0.648, metrics={'acc': 0.6472}]
epoch 6: 100%|██████████| 469/469 [00:06<00:00, 74.80it/s, loss=0.645, metrics={'acc': 0.65}]  
epoch 7: 100%|██████████| 469/469 [00:05<00:00, 78.54it/s, loss=0.656, metrics={'acc': 0.634}] 
epoch 8: 100%|██████████| 469/469 [00:06<00:00, 72.74it/s, loss=0.693, metrics={'acc': 0.5112}]
epoch 9: 100%|██████████| 469/469 [00:06<00:00, 74.71it/s, loss=0.691, metrics={'acc': 0.5116}]
epoch 10: 100%|██████████| 469/469 [00:06<00:00, 76.26it/s, loss=0.699, metrics={'acc': 0.5246}]
epoch 11: 100%|██████████| 469/469 [00:

NaNs in y_valid_preds: True
NaNs in y_valid: False


ValueError: Input contains NaN, infinity or a value too large for dtype('float64').