In [1]:
# Notebook-wide switches.
COLAB: bool = False  # True when running on Google Colab (the path-setup cell then mounts Drive)
USE_GPU: bool = True  # GPU preference flag; not read by any cell visible in this notebook

In [2]:
# basic imports
from pathlib import Path
import os
import math
from datetime import datetime
import random
import gc; gc.enable()

In [3]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Turn off jedi for IPython tab completion.
%config Completer.use_jedi = False
# Notebook name exposed to Weights & Biases through the environment; embeds today's date (YYYYMMDD).
os.environ['WANDB_NOTEBOOK_NAME'] = f"deeptrainer_{datetime.now().strftime('%Y%m%d')}.ipynb"

In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm

# general ML tooling
from sklearn.model_selection import train_test_split, StratifiedKFold, KFold
from sklearn.metrics import log_loss, roc_auc_score
import wandb
from wandb.xgboost import wandb_callback
from wandb.lightgbm import wandb_callback
from sklearn.impute import SimpleImputer #, KNNImputer

import seaborn as sns

# from catboost import CatBoostClassifier
# from xgboost import XGBClassifier
# from lightgbm import LGBMClassifier
# from sklearn.ensemble import StackingClassifier, RandomForestClassifier
from sklearn.preprocessing import RobustScaler #StandardScaler #, MinMaxScaler, MaxAbsScaler, RobustScaler, PolynomialFeatures
# from sklearn.linear_model import LogisticRegression
# from sklearn.feature_selection import SelectKBest, f_regression
from joblib import dump, load
# feature engineering tools
# from sklearn.feature_selection import mutual_info_regression
# import featuretools as ft

# from BorutaShap import BorutaShap
from gauss_rank_scaler import GaussRankScaler

In [5]:
from SAINT import TabAttention # from the official SAINT implementation as of 20211118, https://github.com/somepago/saint/blob/main/models/model.py

In [6]:
import torchinfo
from pytorch_widedeep import Trainer
from pytorch_widedeep.preprocessing import WidePreprocessor, TabPreprocessor
from pytorch_widedeep.models import Wide, TabMlp, WideDeep, SAINT, TabTransformer, TabNet, TabFastFormer, TabResnet
from pytorch_widedeep.metrics import Accuracy
from torchmetrics import AUROC
import torch
from torch.optim import Adam, AdamW, Adagrad, SGD, RMSprop, LBFGS
from torch.optim.lr_scheduler import ReduceLROnPlateau, CosineAnnealingWarmRestarts, CyclicLR, OneCycleLR, StepLR, CosineAnnealingLR
from pytorch_widedeep.callbacks import EarlyStopping, LRHistory, ModelCheckpoint

In [7]:
# import category_encoders as ce

In [8]:
# Resolve the project directories for the current environment.
if COLAB:
    # mount Google Drive so the project folder is reachable
    from google.colab import drive
    drive.mount('/content/drive')

    root = Path('/content/drive/MyDrive/kaggle/tabular_playgrounds/nov2021/')
    datapath = root  # on Colab the dataset files are read directly from the project root
else:
    # local machine
    root = Path('/media/sf/easystore/kaggle_data/tabular_playgrounds/nov2021/')
    datapath = root/'datasets'

# Defined for both environments so cells that use them do not raise NameError on Colab.
predpath = root/'preds'
subpath = root/'submissions'

for pth in [datapath, predpath, subpath]:
    # parents=True also creates `root` itself when it does not exist yet
    pth.mkdir(parents=True, exist_ok=True)
    

In [9]:
SEED = 42


def seed_everything(seed):
    """Seed every random number generator this notebook relies on.

    Seeds Python's `random`, NumPy's global RNG and PyTorch (CPU and all CUDA
    devices), and exports PYTHONHASHSEED.

    Args:
        seed: integer seed applied to all generators.
    """
    random.seed(seed)
    np.random.seed(seed)
    # Only affects interpreters started after this point (e.g. worker subprocesses).
    os.environ['PYTHONHASHSEED'] = str(seed)
    # The wide/deep models are constructed before the Trainer (and its own seed) exists,
    # so torch has to be seeded here for weight initialisation to be repeatable.
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)  # silently ignored when CUDA is unavailable


seed_everything(seed=SEED)

In [10]:
def reduce_memory_usage(df, verbose=True):
    """Downcast numeric columns of `df` to the narrowest dtype that holds their range.

    Integer columns are cast to the smallest of int8/int16/int32/int64 whose
    limits (inclusive) contain the column's min and max. Float columns are cast
    to float16 or float32 when their range fits, otherwise to float64.

    Note: the float downcast is chosen by range only, so float16/float32 columns
    lose precision relative to the original values.

    Args:
        df: DataFrame to shrink. It is modified in place.
        verbose: when True, print the final memory usage and the reduction.

    Returns:
        The same DataFrame object, with downcast columns.
    """
    numerics = ["int8", "int16", "int32", "int64", "float16", "float32", "float64"]
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024 ** 2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue  # leave non-numeric (and unsigned / extension) columns untouched

        c_min = df[col].min()
        c_max = df[col].max()

        if str(col_type)[:3] == "int":
            # Inclusive bounds: a column whose extreme equals a dtype limit still fits.
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            # NaN or infinite extremes fail every comparison and fall through to float64.
            target = np.float64
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    target = candidate
                    break
            df[col] = df[col].astype(target)

    end_mem = df.memory_usage().sum() / 1024 ** 2
    if verbose:
        print(
            "Mem. usage decreased to {:.2f} Mb ({:.1f}% reduction)".format(
                end_mem, 100 * (start_mem - end_mem) / start_mem
            )
        )
    return df

In [11]:
# Experiment configuration; starts with the architecture tag and is extended by later cells.
exmodel_config = dict(arch='widedeep-tabmlp')

In [12]:
# Dataset metadata: where each input artefact lives and which scaler is recorded for it.
# Every value is stored as a string; the dict is later merged into the experiment config.
dataset_params = dict(
    train_source=str(datapath / 'X_orig.feather'),
    target_source=str(datapath / 'y_corrected.joblib'),
    test_source=str(datapath / 'X_test_orig-no_scaling.feather'),
    scaler=str(GaussRankScaler()),
)

# Load the train frame, the target and the test frame from the recorded locations.
X = pd.read_feather(dataset_params['train_source'])
# NOTE(review): joblib.load unpickles the file, so only point it at trusted artefacts.
y = load(dataset_params['target_source'])
X_test = pd.read_feather(dataset_params['test_source'])

In [13]:
# Shrink both frames by downcasting their numeric columns (train first, then test).
X, X_test = [reduce_memory_usage(frame) for frame in (X, X_test)]

In [14]:
from sklearn.decomposition import PCA
import umap

In [15]:
# Human-readable record of each preprocessing stage, stored as strings for experiment logging.
preprocessing_params = {}
preprocessing_params['binning'] = "pd.qcut(X.iloc[:,i],X.shape[1],labels=False,duplicates = 'drop')"
preprocessing_params['scaling, normalization'] = str(GaussRankScaler(epsilon=0.005))
preprocessing_params['reduction'] = str(PCA(n_components='mle', random_state=42))
preprocessing_params['manifold'] = str(
    umap.UMAP(n_components=10, n_neighbors=15, random_state=42, transform_seed=42)
)
preprocessing_params['clustering'] = None

In [16]:
# Load the precomputed, preprocessed training frame from the dataset directory.
# NOTE(review): going by the filename it holds binned + GaussRank-scaled features plus PCA/UMAP outputs — confirm.
X_pre = pd.read_feather(datapath/'X_bins+GaussRankScaled+PCA,UMAP.feather')

In [17]:
# Record the shape of the preprocessed training frame in the dataset metadata.
dataset_params.update(
    feature_count=X_pre.shape[1],
    instance_count=X_pre.shape[0],
)

In [18]:
# Build the wide and deep-tabular model inputs; runs only when the arch name contains 'widedeep'.
# NOTE(review): later cells reference X_wide / X_tab / the two preprocessors unconditionally,
# so any other arch value would leave those names undefined.
if 'widedeep' in exmodel_config['arch']:
    # The first 110 columns are passed as continuous features; the remaining columns are
    # used both as wide columns and as embedding columns for the deep component.
    # NOTE(review): the 110 split point is assumed to match the column layout of X_pre — confirm.
    cont_cols = X_pre.iloc[:,:110].columns
    wide_cols = X_pre.iloc[:, 110:].columns
    wide_preprocessor = WidePreprocessor(wide_cols=wide_cols)
    X_wide = wide_preprocessor.fit_transform(X_pre)
    # X_wide_test = wide_preprocessor.transform(X_pre_test)
    # scale=False asks the preprocessor not to rescale the continuous columns.
    tab_preprocessor = TabPreprocessor(continuous_cols=cont_cols, scale=False, for_transformer=False,embed_cols=wide_cols)
    X_tab = tab_preprocessor.fit_transform(X_pre)
    # X_tab_test = tab_preprocessor.transform(X_pre_test)
    

In [19]:
# Add string descriptions of the fitted wide/deep preprocessors to the preprocessing metadata.
widedeep_preprocessing_params = dict(
    wide=str(wide_preprocessor),
    deeptabular=str(tab_preprocessor),
)
preprocessing_params.update(widedeep_preprocessing_params)
print(preprocessing_params)

In [20]:
# Meta-configuration for preprocessing and validation strategy (model hyperparameters are not set here).
exmodel_config.update(
    cross_val_strategy=None,  # None means holdout; otherwise the relevant sklearn splitter class
    kfolds=1,  # 1 means a plain holdout split
    test_size=0.2,
)
# Dataset metadata first, then preprocessing metadata; on a key clash the later value wins.
exmodel_config.update(dataset_params)
exmodel_config.update(preprocessing_params)

In [21]:
# wandb run metadata
wandb_config = {
    # Run name = notebook filename without its extension + current time (HHMMSS).
    # Path.stem replaces a fixed 6-character slice, so the name stays correct
    # even if the notebook name does not end in ".ipynb".
    'name': f"{Path(os.environ['WANDB_NOTEBOOK_NAME']).stem}_{datetime.now().strftime('%H%M%S')}",
    'tags': ['widedeep', 'deeplearning'],
    'notes': "Part of trials to decide dataset inputs for widedeep models."
}

In [22]:
# Wide component: wide_dim is the number of distinct values found in X_wide; one output unit.
wide = Wide(wide_dim=np.unique(X_wide).shape[0], pred_dim=1)

In [23]:
# Deep tabular component: a TabMlp over the continuous columns, using the column index
# mapping produced by the fitted tab preprocessor.
deeptabular = TabMlp(continuous_cols=cont_cols, column_idx=tab_preprocessor.column_idx)
# Alternative architecture tried previously:
# deeptabular = SAINT(continuous_cols=cont_cols, column_idx=tab_preprocessor.column_idx,)

In [24]:
# Combine the wide and deep-tabular components into a single WideDeep model.
model = WideDeep(wide=wide, deeptabular=deeptabular)
# Optional warm start from previously saved weights:
# model.load_state_dict(torch.load(datapath/"saint_20211121_weights_25epochs/wd_model.pt"))

In [25]:
# Holdout split. A single call splits the wide inputs, the tabular inputs and the target
# on the same rows; the size and seed come from the logged config and SEED rather than
# from literals repeated per call, so the recorded test_size always matches the real split.
X_wide_train, X_wide_valid, X_tab_train, X_tab_valid, y_train, y_valid = train_test_split(
    X_wide, X_tab, y,
    test_size=exmodel_config['test_size'],
    random_state=SEED,
)

In [26]:
# Convert both target splits to NumPy arrays before handing them to the trainer and the metric.
y_train, y_valid = np.array(y_train), np.array(y_valid)

In [27]:
# Training hyperparameters recorded (as one string) in the experiment config.
exmodel_config['training_params'] = str(
    {
        'learning_rate': 0.08763568442121664,
        'weight_decay': 4.414536876494478e-05,
        'wide_optimization': 'AdamW',
        'tab_optimization': 'SGD',
        'wide_momentum': 0.2862031274746775,
        'tab_momentum': 0.7220103849055353,
        'wide_scheduler': 'CosineAnnealingWarmRestarts',
        'tab_scheduler': 'ReduceLROnPlateau',
    }
)

In [28]:
# Start the wandb run; name, tags and notes come from wandb_config, and the full
# experiment configuration is attached as the run config.
wandb.init(
    project="202111_Kaggle_tabular_playground",
    save_code=True,
    config=exmodel_config,
    **wandb_config,
)

<wandb.sdk.wandb_run.Run at 0x7f4c883d9970>

In [29]:
# Optimizer / scheduler setup for the two model components.
n_epochs = 20

# Shared learning rate and weight decay; the same values appear in the logged training_params string.
lr = 0.08763568442121664
wd = 4.414536876494478e-05

# NOTE(review): the wide optimizer receives neither `wd` nor the logged 'wide_momentum';
# only the deep-tabular optimizer gets weight decay and momentum — confirm this is intended.
wide_opt = AdamW(model.wide.parameters(), lr=lr)
deep_opt = SGD(model.deeptabular.parameters(), lr=lr, weight_decay=wd, momentum=0.7220103849055353)

# Earlier OneCycleLR variant, kept for reference:
# wide_sch = OneCycleLR(optimizer=wide_opt, max_lr=0.01, steps_per_epoch=X_wide_train.shape[0], epochs=n_epochs)
# deep_sch = OneCycleLR(optimizer=deep_opt, max_lr=0.01, steps_per_epoch=X_tab_train.shape[0], epochs=n_epochs)
wide_sch = CosineAnnealingWarmRestarts(optimizer=wide_opt, T_0=5) 
deep_sch = ReduceLROnPlateau(optimizer=deep_opt, )

# Per-component dictionaries keyed by model component name.
optimizers = {'wide': wide_opt, 'deeptabular': deep_opt }
# NOTE(review): lr_schedulers is built here but is not passed to Trainer in the visible
# training cell, so these schedulers do not appear to be applied.
lr_schedulers = {'wide': wide_sch, 'deeptabular': deep_sch }

# Callback that records the learning-rate history over n_epochs.
callbacks = [
    LRHistory(n_epochs=n_epochs), 
]

In [30]:
# torch.cuda.empty_cache()

In [31]:
# Build the trainer, fit on the training split, then score the holdout split.
# NOTE(review): `lr_schedulers` is not passed to Trainer, so the schedulers named in the
# logged training_params are not applied in this run.
# NOTE(review): the seed (42) and batch size (1024) are repeated literals rather than shared constants.
trainer = Trainer(model=model, 
                  objective='binary', 
                  metrics=[Accuracy], # with AUROC got TypeError: '>' not supported between instances of 'NoneType' and 'int' 
                  seed=42, 
                  optimizers=optimizers,
                  callbacks=callbacks
                 )

# Fit on the training portion only; no validation data is handed to fit().
trainer.fit( 
    X_wide=X_wide_train,
    X_tab=X_tab_train,
    target=y_train,
    n_epochs=n_epochs,
    batch_size=1024, # default value is 32; 1024 works for TabMLP
    # val_split is not used: the holdout split was already made with train_test_split
)

# Keep column 1 of the predicted probabilities (presumably the positive class — confirm).
y_valid_preds = trainer.predict_proba(X_wide=X_wide_valid, X_tab=X_tab_valid, batch_size=1024)[:,1]
           
    

In [32]:
# Persist the trained weights (state dict) under the dataset directory.
# The epoch count in the directory name is taken from n_epochs so it cannot drift from the real run length.
# NOTE(review): the timestamp is still a fixed literal, so re-running writes to the same directory.
trainer.save(path=datapath/f'widedeep_tabmlp-202111271032-weights-{n_epochs}epochs', save_state_dict=True)

In [33]:
# ROC AUC of the holdout predictions against the holdout targets.
valid_auc = roc_auc_score(y_valid, y_valid_preds)

In [34]:
# Display the holdout AUC as the cell output.
valid_auc

0.9834755416178064

In [35]:
# Log the holdout AUC to the active wandb run.
wandb.log({'overall_valid_auc': valid_auc})

In [36]:
# Close the wandb run.
wandb.finish()