Header for notebooks -- customize as required.

In [1]:
# notebook configuration
COLAB = False # set True on Google Colab; the data-path cell then mounts Google Drive (originally also meant to trigger manual package installs)
USE_GPU = True  # NOTE(review): not referenced anywhere in the visible notebook — confirm it is still needed

In [2]:
# basic imports
from pathlib import Path
import os
import math
from datetime import datetime
import random

In [3]:
# render matplotlib figures inline in the notebook output
%matplotlib inline
# turn off jedi-based tab completion in IPython
%config Completer.use_jedi = False
# notebook name reported to Weights & Biases; it carries today's date (YYYYMMDD) and is reused below to build the run name
os.environ['WANDB_NOTEBOOK_NAME'] = f"sweep_saint_{datetime.now().strftime('%Y%m%d')}.ipynb"

Now, non-stdlib imports

In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm

# general ML tooling
from sklearn.model_selection import train_test_split, StratifiedKFold, KFold
from sklearn.metrics import log_loss, roc_auc_score
import wandb
from wandb.xgboost import wandb_callback
from wandb.lightgbm import wandb_callback
from sklearn.impute import SimpleImputer #, KNNImputer

import seaborn as sns

from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
# from sklearn.ensemble import StackingClassifier, RandomForestClassifier
from sklearn.preprocessing import RobustScaler #StandardScaler #, MinMaxScaler, MaxAbsScaler, RobustScaler, PolynomialFeatures
from sklearn.linear_model import LogisticRegression
# from sklearn.feature_selection import SelectKBest, f_regression
from joblib import dump, load
# feature engineering tools
# from sklearn.feature_selection import mutual_info_regression
# import featuretools as ft

# from BorutaShap import BorutaShap
import optuna
from optuna.integration.wandb import WeightsAndBiasesCallback
from optuna.samplers import TPESampler
from sklearn.utils import resample

In [5]:
from cleanlab.classification import LearningWithNoisyLabels

In [6]:
from gauss_rank_scaler import GaussRankScaler

In [7]:
from pytorch_widedeep import Trainer
from pytorch_widedeep.preprocessing import WidePreprocessor, TabPreprocessor
from pytorch_widedeep.models import Wide, TabMlp, WideDeep, SAINT#, TabTransformer, TabNet, TabFastFormer, TabResnet
from pytorch_widedeep.metrics import Accuracy
from torchmetrics import AUROC
import torch
from torch.optim import Adam, AdamW, Adagrad, SGD, RMSprop, LBFGS
from torch.optim.lr_scheduler import ReduceLROnPlateau, CosineAnnealingWarmRestarts, CyclicLR, OneCycleLR, StepLR, CosineAnnealingLR
from pytorch_widedeep.callbacks import EarlyStopping, LRHistory, ModelCheckpoint

In [8]:
# import category_encoders as ce

  and should_run_async(code)


Now, datapath setup

In [9]:
# Resolve the project root for the current environment, then derive every
# working directory from it so the same names exist on Colab and locally.
if COLAB:
    # mount Google Drive
    from google.colab import drive
    drive.mount('/content/drive')

    root = Path('/content/drive/MyDrive/kaggle/tabular_playgrounds/nov2021/')
    # original Colab layout: the datasets path is the project root itself
    datapath = root
else:
    # if on local machine
    root = Path('/media/sf/easystore/kaggle_data/tabular_playgrounds/nov2021/')
    datapath = root/'datasets'

# Output locations are defined for BOTH environments: later cells write
# studies/predictions/submissions regardless of where the notebook runs.
predpath = root/'preds'
subpath = root/'submissions'
studypath = root/'studies'

# `studypath` is included because the sweep loop checkpoints the Optuna study there.
# `parents` is deliberately not set: a missing root (e.g. unmounted drive) should fail loudly.
for pth in [datapath, predpath, subpath, studypath]:
    pth.mkdir(exist_ok=True)
    


## Helpers

In [10]:
SEED = 42

# Function to seed everything
def seed_everything(seed):
    """Seed every random number generator this notebook relies on.

    Covers Python's ``random`` module, NumPy's legacy global RNG and the
    PyTorch CPU/CUDA generators, and exports ``PYTHONHASHSEED``.

    Parameters
    ----------
    seed : int
        Seed value applied to all generators.
    """
    random.seed(seed)
    np.random.seed(seed)
    # hash randomisation is fixed at interpreter start-up, so this only affects
    # child processes launched after this point
    os.environ['PYTHONHASHSEED'] = str(seed)
    # the models trained below are torch models; without this their weight
    # initialisation and dropout masks would differ between runs
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

seed_everything(seed=SEED)

In [11]:
def reduce_memory_usage(df, verbose=True):
    """Downcast the numeric columns of ``df`` in place to narrower dtypes.

    Integer columns are converted to the smallest of int8/int16/int32/int64
    whose range contains the column's min and max (limits inclusive). Float
    columns are converted to float16 or float32 on the same range test and
    fall back to float64 otherwise. Columns whose dtype is not one of the
    listed int/float dtypes are left untouched.

    Note: the float test looks only at the value range, so casting to
    float16/float32 can round the stored values.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink; its columns are reassigned in place.
    verbose : bool, default True
        If True, print the final memory footprint and the percentage saved.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    numerics = ["int8", "int16", "int32", "int64", "float16", "float32", "float64"]
    start_mem = df.memory_usage().sum() / 1024 ** 2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue
        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == "int":
            # smallest width first; limits are inclusive so e.g. a max of 127 still fits int8
            for candidate in (np.int8, np.int16, np.int32, np.int64):
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in (np.float16, np.float32):
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # also reached when min/max are NaN or infinite: every comparison above is False
                df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024 ** 2
    if verbose:
        # guard the percentage against a zero-byte starting footprint (empty frame)
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print(
            "Mem. usage decreased to {:.2f} Mb ({:.1f}% reduction)".format(
                end_mem, reduction
            )
        )
    return df

## Metadata

In [12]:
# dataset_params records where the training data comes from; it is merged into the
# config logged to W&B below. Commented-out entries are earlier variants
# (pre-computed embeddings, test set, fitted preprocessing artifacts).
dataset_params = {
    # 'train_source': str(datapath/'X-RobustScaled-pca_mle-umap_embedding_20211107-n_comp10-n_neighbors15-rs42.joblib'),
    'train_source': str(datapath/'X_orig.feather'),
    'target_source': str(datapath/'y_corrected.joblib'),
    # 'test_source': str(datapath/'X_test-RobustScaled-pca_mle-umap_embedding_20211107-n_comp10-n_neighbors15-rs42.joblib'),
    # 'test_source': str(datapath/'X_test_orig-no_scaling.feather'),
    # 'scaler': str(RobustScaler()),
    # 'pca': str(load(datapath/'pca_mle-RobustScaled_orig_trainset.joblib')),
    # 'umap': str(load(datapath/'umap_reducer-20211107-n_comp10-n_neighbors15-rs42-pca_mle-RobustScaled_orig_trainset.joblib')),
}   

# referring back to the already-entered attributes, specify how the pipeline was sequenced
# dataset_params['preprocessing_pipeline'] = str([dataset_params['scaler'], dataset_params['pca'], dataset_params['umap']]) # ACTUALLY this is unwieldy
# dataset_params['preprocessing_pipeline'] = '[scaler, pca, umap]' # more fragile, but also more readable

# now, load the datasets and generate more metadata from them
# X = load(dataset_params['train_source'])
X = pd.read_feather(dataset_params['train_source'])
# the target is a joblib artifact; `load` is joblib.load (pickle-based), so only use trusted files
y = load(dataset_params['target_source'])

# downcast numeric columns in place; float columns may end up as float16/float32
X = reduce_memory_usage(X, verbose=True)
# X_test = load(dataset_params['test_source'])
# X_test = pd.read_feather(dataset_params['test_source'])

# dataset_params['feature_count'] = X.shape[1]
# dataset_params['instance_count'] = X.shape[0]
    

Mem. usage decreased to 114.44 Mb (75.0% reduction)


In [13]:
# meta-config for preprocessing and cross-validation, but NOT for model parameters
# in the sweep version, this includes both ex-model parameters and defaults for model parameters
exmodel_config = {
    "arch": 'widedeep-SAINT',
    "type": 'sweep',
    # "denoising": "cleanlab",
    "level": 1,
    'random_state': SEED,
    'tuner': "Optuna",
    'cross_val_strategy': None, # None for holdout, or the relevant sklearn class
    'kfolds': 1, # if 1, that means just doing holdout
    'test_size': 0.2,
    'scaler': str(GaussRankScaler(epsilon=0.005)),
    'binning': 'pd.qcut(X.iloc[:,i],X.shape[1],labels=False)',
    **dataset_params
}

# W&B run name = notebook filename stem + wall-clock time, so every launch gets a distinct run.
# Path.stem removes the ".ipynb" suffix without relying on a hard-coded slice length.
notebook_stem = Path(os.environ['WANDB_NOTEBOOK_NAME']).stem

wandb_config = {
    # wandb config
    'name': f"{notebook_stem}_{datetime.now().strftime('%H%M%S')}",
    'project': '202111_Kaggle_tabular_playground',
    'tags': ['sweep'],
    'notes': "SAINT sweep on 20% sample of corrected data, using holdout",
    'config': exmodel_config,
}

## Preprocessing Data
Inspired a bit by Laurent Pourchot's Aug2021 Tabular Playground entry, I'm going to try to generate two versions of the dataset: a categorical one, using bins, and then (for now) a GaussRankScaled one. In the future, I might add further variations, e.g. with feature reduction via PCA and perhaps also UMAP and also denoising; I might also try other normalizations, e.g. Quantile.

## Binning (Generating wide cols)

In [14]:
# h/t Laurent Pourchot https://www.kaggle.com/pourchot/in-python-tabular-denoising-residual-network/

# Pre-allocate the matrix of bin codes for the "bins" head of the NN: one row per
# instance and one column per feature (the original author sized it for all available data).
X_bins = np.zeros(X.shape)
# X_test_bins = np.zeros((X_test.shape[0], X_test.shape[1]))

In [15]:
# Quantile-bin every feature column into integer codes.
# Note: the number of quantiles passed to qcut is the number of columns, X.shape[1].
n_cols = X.shape[1]
for col_pos in range(n_cols): # assumes X is a pd.DataFrame
    X_bins[:, col_pos] = pd.qcut(X.iloc[:, col_pos], n_cols, labels=False)#,duplicates = 'drop')
    
# for i in range(X_test.shape[1]): # assumes X_test is a pd.DataFrame
#     X_test_bins[:,i] = pd.qcut(X_test.iloc[:,i],X_test.shape[1], labels=False)#,duplicates = 'drop')
# blabeled = X_bins[:X.shape[0],:]
# bunlabeled = X_ins[X.shape[0]:,:]

In [16]:
# X_bins = X_bins.astype(np.int8)
# X_test_bins = X_test_bins.astype(np.int8)

In [17]:
# Wrap the bin codes in a DataFrame aligned with X; one 'rkd_f<k>' column per binned feature.
# The column count is taken from the array itself instead of a hard-coded 100.
X_bins = pd.DataFrame(X_bins, index=X.index, columns=[f'rkd_f{col}' for col in range(X_bins.shape[1])])
# X_test_bins = pd.DataFrame(X_test_bins, index=X_test.index, columns=[f'rkd_f{col}' for col in range(100)])

## Normalizing (Preprocessing Deep Cols)

In [18]:
# fit the Gauss-rank scaler on the training features and transform them in one step
# (same epsilon as the scaler recorded in exmodel_config)
scaler = GaussRankScaler(n_jobs=-1, epsilon=0.005)
X_gauss = scaler.fit_transform(X)
# X_test_gauss = scaler.transform(X_test)

In [19]:
# restore the original column labels and index on the scaled values so they can be joined with the bin columns
X_gauss = pd.DataFrame(X_gauss, columns=X.columns, index=X.index)
# X_test_gauss = pd.DataFrame(X_test_gauss, columns=X_test.columns, index=X_test.index)

  and should_run_async(code)


## Preparing Data for WideDeep

In [20]:
# one frame holding both views: scaled continuous columns first, then the 'rkd_f*' bin columns
X_pre = X_gauss.join(X_bins)
# X_test_pre = X_test_gauss.join(X_test_bins)

In [21]:
# Take each column group from the frame it came from rather than from a hard-coded
# split position, so the grouping stays correct if the feature count is not 100.
cont_cols = X_gauss.columns   # scaled continuous features (first block of X_pre)
wide_cols = X_bins.columns    # quantile-bin codes (second block of X_pre)

In [22]:
# encode the bin columns for the wide component of the model
wide_preprocessor = WidePreprocessor(wide_cols=wide_cols)
X_wide = wide_preprocessor.fit_transform(X_pre)
# X_test_wide = wide_preprocessor.transform(X_test_pre)

In [23]:
# prepare the deeptabular input: continuous columns are passed through unscaled (they are already Gauss-rank scaled),
# and the bin columns are declared as embedding columns.
# NOTE(review): the SAINT model built in `objective` receives only `continuous_cols` and `column_idx` —
# confirm the embedded bin columns are actually consumed, and whether `for_transformer` should be True for a transformer model.
tab_preprocessor = TabPreprocessor(continuous_cols=cont_cols, scale=False, for_transformer=False,embed_cols=wide_cols, already_standard=True)
X_tab = tab_preprocessor.fit_transform(X_pre)
# X_test_tab = tab_preprocessor.transform(X_test_pre)



In [24]:
# trimming the sets down 
from sklearn.utils.random import sample_without_replacement

In [25]:
# draw 20% of the row positions without replacement (fixed seed); used below to shrink the data for the sweep
subsample = sample_without_replacement(X_wide.shape[0], int(X_wide.shape[0] * 0.2), random_state=42,)

In [26]:
# quick look at the sampled row positions
subsample

array([  4242,  60608, 392832, ..., 583530, 285430, 391044])

In [27]:
# keep only the sampled rows in every array.
# NOTE: this rebinds the names, so re-running the cell would index the already-reduced arrays again.
X_wide = X_wide[subsample]
X_tab = X_tab[subsample]
y = y[subsample]  # NOTE(review): positional only if y is an ndarray or has a default integer index — confirm

# sanity check: rows kept x wide columns
X_wide.shape

(120000, 100)

In [28]:
# del X_deep_train, X_deep_valid
# A single call splits all three arrays with the same shuffled indices, so the wide rows,
# tab rows and targets are aligned by construction (instead of relying on two calls sharing
# a seed) and IPython's `_` is no longer overwritten.
X_wide_train, X_wide_valid, X_tab_train, X_tab_valid, y_train, y_valid = train_test_split(
    X_wide, X_tab, y, test_size=0.2, random_state=42
)

In [29]:
# hand plain NumPy arrays to the trainer and to the metric
y_train = np.array(y_train)
y_valid = np.array(y_valid)

In [30]:
from torch import optim

## Objective Function

In [31]:
def objective(trial, X_wide_train=X_wide_train, X_tab_train=X_tab_train, X_wide_valid=X_wide_valid, X_tab_valid=X_tab_valid, 
              y_train=y_train, y_valid=y_valid, cont_cols=cont_cols, tab_preprocessor=tab_preprocessor):
    """Optuna objective: train one Wide + SAINT model and return its holdout ROC AUC.

    Each call samples SAINT architecture options, a shared learning rate and
    weight decay, and an optimizer plus LR scheduler for each of the two model
    components, trains for a fixed number of epochs and scores the validation
    split.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyperparameters.
    X_wide_train, X_wide_valid
        Encoded inputs for the wide component (training / validation split).
    X_tab_train, X_tab_valid
        Inputs for the deeptabular (SAINT) component.
    y_train, y_valid
        Targets for the two splits.
    cont_cols
        Names of the continuous columns passed to SAINT.
    tab_preprocessor
        Fitted preprocessor; only its ``column_idx`` mapping is used here.

    The data arguments default to the notebook-level objects that exist when
    this cell is executed, so Optuna can call ``objective(trial)`` directly.

    Returns
    -------
    float
        ROC AUC on the validation split, or 0.0 when the model produced
        non-finite predictions (diverged training).
    """
    # Size the wide component from the largest encoded value in EITHER split. The unique
    # count of the training split alone can be smaller than the largest index once rows
    # have been subsampled. NOTE(review): assumes the encoded values are non-negative ints.
    wide_dim = int(max(np.max(X_wide_train), np.max(X_wide_valid)))
    wide = Wide(wide_dim=wide_dim, pred_dim=1)

    # SAINT-specific params
    saint_params = {
        'embed_dropout': trial.suggest_float('embedding_dropout', 0.05, 0.2),
        'cont_norm_layer': trial.suggest_categorical('continuous_norm_layer', ['layernorm', 'batchnorm', None]),
        'use_bias': trial.suggest_categorical('use_bias_in_projection', [True, False]),
        'attn_dropout': trial.suggest_float('attention_dropout', 0.1, 0.3),
        'ff_dropout': trial.suggest_float('feedforward_dropout', 0.05, 0.2),
        'transformer_activation': trial.suggest_categorical('transformer_activation_func', ['tanh', 'relu', 'leaky_relu', 'gelu', 'geglu', 'reglu']),
        'mlp_activation': trial.suggest_categorical('mlp_activation_func', ['tanh', 'relu', 'leaky_relu', 'gelu']),
        'mlp_dropout': trial.suggest_float('final_mlp_dropout', 0.05, 0.2),
        'mlp_batchnorm': trial.suggest_categorical('final_mlp_batchnorm', [True, False]),
        'mlp_batchnorm_last': trial.suggest_categorical('final_mlp_batchnorm_last_dense_layer', [True, False]),
    }
    # held back for now, could be tested later: input_dim, n_heads, n_blocks,
    # shared_embed / add_shared_embed / frac_shared_embed, embed_continuous, mlp_hidden_dims

    print(saint_params)

    deeptabular = SAINT(continuous_cols=cont_cols, column_idx=tab_preprocessor.column_idx, **saint_params)
    model = WideDeep(wide=wide, deeptabular=deeptabular)

    n_epochs = 20
    batch_size = 4 # library default is 32; 4 gives 24000 steps per epoch on the 96k-row training split

    optimizer_candidates = {
        'AdamW': optim.AdamW,
        'Adam': optim.Adam,
        'Adagrad': optim.Adagrad,
        'RMSprop': optim.RMSprop,
        # 'LBFGS': optim.LBFGS,
        'SGD': optim.SGD,
    }
    optimizer_names = list(optimizer_candidates)  # a real sequence rather than a dict_keys view

    # trial define trans-optimizer params
    # NOTE(review): the learning rate is sampled on a linear scale up to 0.3, and the recorded
    # run shows loss=nan during the first epoch — consider log=True and/or a lower ceiling.
    lr = trial.suggest_float('learning_rate', 0.001, 0.3)
    # same log-uniform range as before; suggest_float(log=True) replaces the deprecated suggest_loguniform
    wd = trial.suggest_float('weight_decay', 0.00001, 0.1, log=True)

    def build_optimizer(name, params, momentum):
        """Instantiate the named optimizer; only RMSprop and SGD are given a momentum."""
        extra = {'momentum': momentum} if name in ('RMSprop', 'SGD') else {}
        return optimizer_candidates[name](params, lr=lr, weight_decay=wd, **extra)

    # define optimizers for components (the suggestion order is kept as in the original sweep)
    wide_optimizer = trial.suggest_categorical('wide_optimization', optimizer_names)
    tab_optimizer = trial.suggest_categorical('tab_optimization', optimizer_names)
    wide_momentum = trial.suggest_float('wide_momentum', 0, 0.9)
    wide_opt = build_optimizer(wide_optimizer, model.wide.parameters(), wide_momentum)

    tab_momentum = trial.suggest_float('tab_momentum', 0.7, 0.9)
    tab_opt = build_optimizer(tab_optimizer, model.deeptabular.parameters(), tab_momentum)

    # Scheduler factories: only the scheduler that is actually selected gets constructed,
    # instead of attaching every candidate to the same optimizer.
    scheduler_factories = {
        'CosineAnnealingLR': lambda opt: optim.lr_scheduler.CosineAnnealingLR(optimizer=opt, T_max=10),
        'ReduceLROnPlateau': lambda opt: optim.lr_scheduler.ReduceLROnPlateau(optimizer=opt),
        # CyclicLR / OneCycleLR were tried and left out
        'CosineAnnealingWarmRestarts': lambda opt: optim.lr_scheduler.CosineAnnealingWarmRestarts(optimizer=opt, T_0=5),
    }
    scheduler_names = list(scheduler_factories)

    wide_scheduler = trial.suggest_categorical('wide_scheduling', scheduler_names)
    tab_scheduler = trial.suggest_categorical('tab_scheduling', scheduler_names)

    wide_sch = scheduler_factories[wide_scheduler](wide_opt)
    deep_sch = scheduler_factories[tab_scheduler](tab_opt) # note the name shift

    optimizers = {'wide': wide_opt, 'deeptabular': tab_opt }
    lr_schedulers = {'wide': wide_sch, 'deeptabular': deep_sch }

    callbacks = [
        LRHistory(n_epochs=n_epochs),
    ]

    # trainer
    # The schedulers are now handed to the Trainer; previously the dict was built but never
    # used, so the sampled scheduler choices had no effect on training.
    trainer = Trainer(model=model,
                      objective='binary',
                      metrics=[Accuracy], # with AUROC got TypeError: '>' not supported between instances of 'NoneType' and 'int'
                      seed=42,
                      optimizers=optimizers,
                      lr_schedulers=lr_schedulers,
                      callbacks=callbacks
                     )

    fit_kwargs = dict(
        X_wide=X_wide_train,
        X_tab=X_tab_train,
        target=y_train,
        n_epochs=n_epochs,
        batch_size=batch_size,
    )
    if 'ReduceLROnPlateau' in (wide_scheduler, tab_scheduler):
        # ReduceLROnPlateau steps on a monitored value, so a validation set is supplied to fit.
        # NOTE(review): confirm the installed pytorch_widedeep version forwards the validation
        # loss to ReduceLROnPlateau; this also means the scored split influences the LR schedule.
        fit_kwargs['X_val'] = {'X_wide': X_wide_valid, 'X_tab': X_tab_valid, 'target': y_valid}
    trainer.fit(**fit_kwargs)

    preds = trainer.predict_proba(X_wide=X_wide_valid, X_tab=X_tab_valid, batch_size=batch_size)[:,1]
    if not np.all(np.isfinite(preds)):
        # a diverged model (loss=nan) yields NaN probabilities, which roc_auc_score rejects with
        # an exception that would abort the whole sweep; score the trial as worst-case instead
        print("Non-finite predictions (training diverged); scoring this trial as 0.0")
        return 0.0
    valid_auc = roc_auc_score(y_true=y_valid, y_score=preds)
    print(f"Valid AUC score for is {valid_auc}")
    return valid_auc
    

In [32]:
# Optuna callback that logs each finished trial to Weights & Biases, using the run settings in wandb_config
wandbc = WeightsAndBiasesCallback(wandb_kwargs=wandb_config)

  wandbc = WeightsAndBiasesCallback(wandb_kwargs=wandb_config)
[34m[1mwandb[0m: Currently logged in as: [33mhushifang[0m (use `wandb login --relogin` to force relogin)
[34m[1mwandb[0m: wandb version 0.12.7 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


In [33]:
# drop the large intermediate frames before the sweep; the arrays the objective needs are already bound as its default arguments
del X_pre, X, X_gauss, X_bins

In [34]:
# new in-memory study that maximises the objective's return value (validation AUC), with a seeded TPE sampler
study = optuna.create_study(direction = "maximize", sampler = TPESampler(seed=int(SEED)), study_name='widedeep-saint-20211126')
# study = load()

[32m[I 2021-11-26 18:48:58,102][0m A new study created in memory with name: widedeep-saint-20211126[0m


In [35]:
# dump(study, filename=studypath/f'optuna-widedeep_tabmlp-20211124.joblib')
# Run the sweep one trial at a time so the study can be checkpointed to disk after every trial.
N_TRIALS = 500
study_file = studypath/'optuna-widedeep_saint-20211126.joblib'
# make sure the checkpoint directory exists before the first (long) trial finishes
study_file.parent.mkdir(parents=True, exist_ok=True)

for _trial_idx in range(N_TRIALS):
    torch.cuda.empty_cache()
    # timestamp each trial start (the previous format string had a stray ')' that was printed verbatim)
    print(datetime.now().strftime('%Y%m%d %H:%M:%S'))
    study.optimize(objective, n_trials = 1, callbacks = [wandbc]) #n_jobs = multiprocessing.cpu_count())
    dump(study, filename=study_file)

# study.optimize(objective, n_trials=18, callbacks=[wandbc])

20211126 18:48:58)
{'embed_dropout': 0.10618101782710439, 'cont_norm_layer': 'layernorm', 'use_bias': True, 'attn_dropout': 0.1116167224336399, 'ff_dropout': 0.1799264218662403, 'transformer_activation': 'gelu', 'mlp_activation': 'gelu', 'mlp_dropout': 0.11479175279631737, 'mlp_batchnorm': False, 'mlp_batchnorm_last': False}


epoch 1:  56%|█████▋    | 13518/24000 [17:20<13:26, 13.00it/s, loss=nan, metrics={'acc': 0.0006}]  


KeyboardInterrupt: 

In [None]:
# close the active Weights & Biases run once the sweep is done or interrupted
wandb.finish()

In [None]:
# hyperparameters of the best trial recorded so far
study.best_params

In [None]:
# Final checkpoint of this SAINT study. It is written to the same file as the per-trial
# checkpoints; the previous target was the TabMlp study file from 20211124, which this
# dump would have overwritten.
dump(study, studypath/'optuna-widedeep_saint-20211126.joblib')