Header for notebooks -- customize as required.

In [1]:
# notebook configuration
COLAB = False # will trigger manual installation of packages
USE_GPU = True 

In [2]:
# basic imports
from pathlib import Path
import os
import math
from datetime import datetime
import random

In [3]:
%matplotlib inline
%config Completer.use_jedi = False
os.environ['WANDB_NOTEBOOK_NAME'] = f"widedeep_corrected_{datetime.now().strftime('%Y%m%d')}.ipynb"

Now, non-stdlib imports

In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm

# general ML tooling
from sklearn.model_selection import train_test_split, StratifiedKFold, KFold
from sklearn.metrics import log_loss, roc_auc_score
import wandb
from wandb.xgboost import wandb_callback
from wandb.lightgbm import wandb_callback
from sklearn.impute import SimpleImputer #, KNNImputer

import seaborn as sns

from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
# from sklearn.ensemble import StackingClassifier, RandomForestClassifier
from sklearn.preprocessing import RobustScaler #StandardScaler #, MinMaxScaler, MaxAbsScaler, RobustScaler, PolynomialFeatures
from sklearn.linear_model import LogisticRegression
# from sklearn.feature_selection import SelectKBest, f_regression
from joblib import dump, load
# feature engineering tools
# from sklearn.feature_selection import mutual_info_regression
# import featuretools as ft

# from BorutaShap import BorutaShap
import optuna
from optuna.integration.wandb import WeightsAndBiasesCallback
from optuna.samplers import TPESampler
from sklearn.utils import resample

In [5]:
from cleanlab.classification import LearningWithNoisyLabels

In [6]:
from gauss_rank_scaler import GaussRankScaler

In [7]:
from pytorch_widedeep import Trainer
from pytorch_widedeep.preprocessing import WidePreprocessor, TabPreprocessor
from pytorch_widedeep.models import Wide, TabMlp, WideDeep#, SAINT, TabTransformer, TabNet, TabFastFormer, TabResnet
from pytorch_widedeep.metrics import Accuracy
from torchmetrics import AUROC
import torch
from torch.optim import Adam, AdamW, Adagrad, SGD, RMSprop, LBFGS
from torch.optim.lr_scheduler import ReduceLROnPlateau, CosineAnnealingWarmRestarts, CyclicLR, OneCycleLR, StepLR, CosineAnnealingLR
from pytorch_widedeep.callbacks import EarlyStopping, LRHistory, ModelCheckpoint

In [8]:
# import category_encoders as ce

  and should_run_async(code)


Now, datapath setup

In [9]:
if COLAB:
    # mount Google Drive
    from google.colab import drive
    drive.mount('/content/drive')
    
    # handling datapath
    datapath = Path('/content/drive/MyDrive/kaggle/tabular_playgrounds/nov2021/')
    
else:
    # if on local machine
#     datapath = Path('/media/sf/easystore/kaggle_data/tabular_playgrounds/sep2021/')  
    root = Path('/media/sf/easystore/kaggle_data/tabular_playgrounds/nov2021/')
    datapath = root/'datasets'
    # edapath = root/'EDA'
    # modelpath = Path('/media/sf/easystore/kaggle_data/tabular_playgrounds/oct2021/models/')
    predpath = root/'preds'
    subpath = root/'submissions'
    
    for pth in [datapath, predpath, subpath]:
        pth.mkdir(exist_ok=True)
    


## Helpers

In [10]:
SEED = 42

# Function to seed everything
def seed_everything(seed):
    random.seed(seed)
    np.random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)

seed_everything(seed=SEED)

In [11]:
def reduce_memory_usage(df, verbose=True):
    numerics = ["int8", "int16", "int32", "int64", "float16", "float32", "float64"]
    start_mem = df.memory_usage().sum() / 1024 ** 2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == "int":
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if (
                    c_min > np.finfo(np.float16).min
                    and c_max < np.finfo(np.float16).max
                ):
                    df[col] = df[col].astype(np.float16)
                elif (
                    c_min > np.finfo(np.float32).min
                    and c_max < np.finfo(np.float32).max
                ):
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024 ** 2
    if verbose:
        print(
            "Mem. usage decreased to {:.2f} Mb ({:.1f}% reduction)".format(
                end_mem, 100 * (start_mem - end_mem) / start_mem
            )
        )
    return df

## Metadata

In [12]:
# dataset_params will initially include either trivial class instances or loaded, precomputed artifacts
dataset_params = {
    # 'train_source': str(datapath/'X-RobustScaled-pca_mle-umap_embedding_20211107-n_comp10-n_neighbors15-rs42.joblib'),
    'train_source': str(datapath/'X_orig.feather'),
    'target_source': str(datapath/'y_corrected.joblib'),
    # 'test_source': str(datapath/'X_test-RobustScaled-pca_mle-umap_embedding_20211107-n_comp10-n_neighbors15-rs42.joblib'),
    'test_source': str(datapath/'X_test_orig-no_scaling.feather'),
    # 'scaler': str(RobustScaler()),
    # 'pca': str(load(datapath/'pca_mle-RobustScaled_orig_trainset.joblib')),
    # 'umap': str(load(datapath/'umap_reducer-20211107-n_comp10-n_neighbors15-rs42-pca_mle-RobustScaled_orig_trainset.joblib')),
}   

# referring back to the already-entered attributes, specify how the pipeline was sequenced
# dataset_params['preprocessing_pipeline'] = str([dataset_params['scaler'], dataset_params['pca'], dataset_params['umap']]) # ACTUALLY this is unwieldy
# dataset_params['preprocessing_pipeline'] = '[scaler, pca, umap]' # more fragile, but also more readable

# now, load the datasets and generate more metadata from them
# X = load(dataset_params['train_source'])
X = pd.read_feather(dataset_params['train_source'])
y = load(dataset_params['target_source'])
# X_test = load(dataset_params['test_source'])
X_test = pd.read_feather(dataset_params['test_source'])

dataset_params['feature_count'] = X.shape[1]
dataset_params['instance_count'] = X.shape[0]
    

In [13]:
# meta-config for preprocessing and cross-validation, but NOT for model parameters
# in the sweep version, this includes both ex-model parameters and defaults for model parameters
exmodel_config = {
    "arch": 'widedeep-TabMLP',
    # "type": 'sweep',
    # "denoising": "cleanlab",
    "level": 1,
    'random_state': SEED,
    # 'tuner': "Optuna",
    'cross_val_strategy': None, # None for holdout, or the relevant sklearn class
    'kfolds': 1, # if 1, that means just doing holdout
    'test_size': 0.2,
    'scaler': str(GaussRankScaler()),
    **dataset_params
}

wandb_config = {
    # wandb config
    'name': f"{os.environ['WANDB_NOTEBOOK_NAME'][:-6]}_{datetime.now().strftime('%H%M%S')}", # just removes the .ipynb extension, leaving the notebook filename's stem
    'project': '202111_Kaggle_tabular_playground',
    'tags': ['experiment'],
    'notes': "Comparing.",
    'config': exmodel_config,
}

# Noisy Run

## Preprocessing Data
Inspired a bit by Laurent Pourchot's Aug2021 Tabular Playground entry, I'm going to try to generate two versions of the dataset: a categorical one, using bins, and then (for now) a GaussRankScaled one. In the future, I might add further variations, e.g. with feature reduction via PCA and perhaps also UMAP and also denoising; I might also try other normalizations, e.g. Quantile.

## Binning (Generating wide cols)

In [14]:
# h/t Laurent Pourchot https://www.kaggle.com/pourchot/in-python-tabular-denoising-residual-network/

# 100 bins for the bins head of the NN (i.e. percentiles):
X_bins = np.zeros((X.shape[0],X.shape[1])) # he used all available data for the first tuple entry, but I'll start like this
X_test_bins = np.zeros((X_test.shape[0], X_test.shape[1]))

In [15]:
for i in range(X.shape[1]): # assumes X is a pd.DataFrame
    X_bins[:,i] = pd.qcut(X.iloc[:,i],X.shape[1],labels=False)#,duplicates = 'drop')
    
for i in range(X_test.shape[1]): # assumes X_test is a pd.DataFrame
    X_test_bins[:,i] = pd.qcut(X_test.iloc[:,i],X_test.shape[1], labels=False)#,duplicates = 'drop')
# blabeled = X_bins[:X.shape[0],:]
# bunlabeled = X_ins[X.shape[0]:,:]

In [16]:
np.isnan(X_test_bins).any()

False

In [17]:
# X_bins = X_bins.astype(np.int8)
# X_test_bins = X_test_bins.astype(np.int8)

In [18]:
X_bins = pd.DataFrame(X_bins, index=X.index, columns=[f'rkd_f{col}' for col in range(100)])
X_test_bins = pd.DataFrame(X_test_bins, index=X_test.index, columns=[f'rkd_f{col}' for col in range(100)])

## Normalizing (Preprocessing Deep Cols)

In [19]:
scaler = GaussRankScaler(n_jobs=-1, epsilon=0.005)
X_gauss = scaler.fit_transform(X)
X_test_gauss = scaler.transform(X_test)

In [20]:
np.where(np.isnan(X_test_gauss))

  and should_run_async(code)


(array([], dtype=int64), array([], dtype=int64))

In [21]:
X_gauss = pd.DataFrame(X_gauss, columns=X.columns, index=X.index)
X_test_gauss = pd.DataFrame(X_test_gauss, columns=X_test.columns, index=X_test.index)

## Preparing Data for WideDeep

In [22]:
X_pre = X_gauss.join(X_bins)
X_test_pre = X_test_gauss.join(X_test_bins)

In [23]:
cont_cols = X_pre.iloc[:,:100].columns
wide_cols = X_pre.iloc[:, 100:].columns

In [24]:
wide_preprocessor = WidePreprocessor(wide_cols=wide_cols)
X_wide = wide_preprocessor.fit_transform(X_pre)
X_test_wide = wide_preprocessor.transform(X_test_pre)

In [25]:
tab_preprocessor = TabPreprocessor(continuous_cols=cont_cols, scale=False, for_transformer=False,embed_cols=wide_cols, already_standard=True)
X_tab = tab_preprocessor.fit_transform(X_pre)
X_test_tab = tab_preprocessor.transform(X_test_pre)



In [26]:
wide = Wide(wide_dim=np.unique(X_wide).shape[0], pred_dim=1)

In [27]:
# deeptabular = TabMlp(continuous_cols=X_gauss.columns, column_idx=tab_preprocessor.column_idx)
deeptabular = TabMlp(continuous_cols=cont_cols, column_idx=tab_preprocessor.column_idx)

In [28]:
model = WideDeep(wide=wide, deeptabular=deeptabular)

In [29]:
# del X_deep_train, X_deep_valid
X_wide_train, X_wide_valid, y_train, y_valid = train_test_split(X_wide, y, test_size=0.2, random_state=42)
X_tab_train, X_tab_valid, _, _ = train_test_split(X_tab, y, test_size=0.2, random_state=42)

In [30]:
y_train = np.array(y_train)
y_valid = np.array(y_valid)

In [None]:
n_epochs = 50

wide_opt = AdamW(model.wide.parameters(), lr=0.1)
deep_opt = AdamW(model.deeptabular.parameters(), lr=0.1)

wide_sch = OneCycleLR(optimizer=wide_opt, max_lr=0.01, steps_per_epoch=X_wide_train.shape[0], epochs=n_epochs)
deep_sch = OneCycleLR(optimizer=deep_opt, max_lr=0.01, steps_per_epoch=X_tab_train.shape[0], epochs=n_epochs)

optimizers = {'wide': wide_opt, 'deeptabular': deep_opt }
lr_schedulers = {'wide': wide_sch, 'deeptabular': deep_sch }


callbacks = [
    LRHistory(n_epochs=n_epochs), 
]

# trainer
trainer = Trainer(model=model, 
                  objective='binary', 
                  metrics=[Accuracy], # with AUROC got TypeError: '>' not supported between instances of 'NoneType' and 'int' 
                  seed=42, 
                  optimizers=optimizers,
                  callbacks=callbacks
                 )

#             print(f"type(X_train_wide) is {type(X_train_wide)} and type(X_train_tab) is {type(X_train_tab)}")
trainer.fit( # this is where problem is beginning
    X_wide=X_wide_train,
    X_tab=X_tab_train,
    target=y_train,
    n_epochs=n_epochs,
    batch_size=1024, # default value is 32
#                 val_split=0.2, # no need for this
)

y_valid_preds = trainer.predict_proba(X_wide=X_wide_valid, X_tab=X_tab_valid, batch_size=1024)[:,1]
           
    

epoch 1: 100%|██████████| 469/469 [00:05<00:00, 81.93it/s, loss=1.13, metrics={'acc': 0.7708}] 
epoch 2: 100%|██████████| 469/469 [00:05<00:00, 89.98it/s, loss=0.845, metrics={'acc': 0.8317}] 
epoch 3: 100%|██████████| 469/469 [00:05<00:00, 88.96it/s, loss=0.788, metrics={'acc': 0.8371}]
epoch 4: 100%|██████████| 469/469 [00:05<00:00, 88.59it/s, loss=0.732, metrics={'acc': 0.841}]  
epoch 5: 100%|██████████| 469/469 [00:05<00:00, 88.14it/s, loss=0.685, metrics={'acc': 0.8446}]
epoch 6: 100%|██████████| 469/469 [00:05<00:00, 88.78it/s, loss=0.642, metrics={'acc': 0.8478}] 
epoch 7: 100%|██████████| 469/469 [00:05<00:00, 88.03it/s, loss=0.597, metrics={'acc': 0.8519}]
epoch 8: 100%|██████████| 469/469 [00:05<00:00, 88.77it/s, loss=0.562, metrics={'acc': 0.854}]  
epoch 9: 100%|██████████| 469/469 [00:05<00:00, 88.08it/s, loss=0.523, metrics={'acc': 0.8577}]
epoch 10: 100%|██████████| 469/469 [00:05<00:00, 88.69it/s, loss=0.49, metrics={'acc': 0.8606}]  
epoch 11: 100%|██████████| 469/469

In [None]:
dirty_preds = trainer.predict_proba(X_wide=X_test_wide, X_tab=X_test_tab, batch_size=1024,)[:,1]
np.isnan(dirty_preds).any()

In [None]:
dump(dirty_preds, predpath/'cleanlab_widedeep_20211123-TabMLP-dirtydata-holdout-baseline_preds.joblib')

In [None]:
sample_df = pd.read_csv(datapath/'sample_submission.csv')

In [None]:
sample_df.head()

In [None]:
sample_df.loc[:, 'target'] = dirty_preds

In [None]:
sample_df.head()

In [55]:
sample_df.to_csv(subpath/f"{wandb_config['name']}-TabMLP-corrected_data-holdout-baseline_preds.csv", index=False)
# sample_df.to_csv(subpath/f"{wandb_config['name']}_3level-X_orig+KMeans8+synth-GBM-stack_ensemble_preds.csv", index=False)

  and should_run_async(code)


In [60]:
wandb.init(
    project="202111_Kaggle_tabular_playground",
    save_code=True,
    tags=wandb_config['tags'],
    name=wandb_config['name'],
    notes=wandb_config['notes'],
    config=exmodel_config
) 

[34m[1mwandb[0m: Currently logged in as: [33mhushifang[0m (use `wandb login --relogin` to force relogin)
[34m[1mwandb[0m: wandb version 0.12.7 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


In [68]:
import torchinfo


  and should_run_async(code)


In [70]:
summary = torchinfo.summary(model)

In [72]:
str(summary)



In [74]:
wandb.log({'overall_valid_auc': roc_auc_score(y_true=y_valid, y_score=y_valid_preds),
           'model_params': model.parameters,
           'model_summary': str(torchinfo.summary(model)),
           'model_seed': 42,
           'leadboard_auc': 0.72913,
          })

  and should_run_async(code)


In [75]:
wandb.finish()

  and should_run_async(code)


VBox(children=(Label(value=' 0.35MB of 0.35MB uploaded (0.10MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
leadboard_auc,▁
model_seed,▁
overall_valid_auc,▁

0,1
leadboard_auc,0.72913
model_params,torch.nn.modules.mod...
model_seed,42
model_summary,====================...
overall_valid_auc,0.74197


# CleanLab version

In [58]:
del X_gauss, X_test_gauss, X_pre, X_test_pre

In [59]:
# trainer
clean_trainer = Trainer(model=model, 
                  objective='binary', 
                  metrics=[Accuracy], # with AUROC got TypeError: '>' not supported between instances of 'NoneType' and 'int' 
                  seed=42, 
                  optimizers=optimizers,
                  callbacks=callbacks
                 )

In [76]:
rp_params = {
    # 'prune_method': trial.suggest_categorical('prune_method', ['prune_by_noise_rate', 'prune_by_class', 'both']),
    # 'converge_latent_estimates': trial.suggest_categorical('converge_latent_estimates', [True, False]),
    # 'pulearning': trial.suggest_categorical('pulearning', [0,1,None])
    'prune_method': 'both',
    'converge_latent_estimates': True,
    'pulearning': 1,
}

In [78]:
Trainer.__init__?

[0;31mSignature:[0m
[0mTrainer[0m[0;34m.[0m[0m__init__[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mself[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mmodel[0m[0;34m:[0m [0mpytorch_widedeep[0m[0;34m.[0m[0mmodels[0m[0;34m.[0m[0mwide_deep[0m[0;34m.[0m[0mWideDeep[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mobjective[0m[0;34m:[0m [0mstr[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mcustom_loss_function[0m[0;34m:[0m [0mUnion[0m[0;34m[[0m[0mtorch[0m[0;34m.[0m[0mnn[0m[0;34m.[0m[0mmodules[0m[0;34m.[0m[0mmodule[0m[0;34m.[0m[0mModule[0m[0;34m,[0m [0mNoneType[0m[0;34m][0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0moptimizers[0m[0;34m:[0m [0mUnion[0m[0;34m[[0m[0mtorch[0m[0;34m.[0m[0moptim[0m[0;34m.[0m[0moptimizer[0m[0;34m.[0m[0mOptimizer[0m[0;34m,[0m [0mDict[0m[0;34m[[0m[0mstr[0m[0;34m,[0m [0mtorch[0m[0;34m.[0m[0moptim[0m[0;34m.[0m[0moptimizer[0m[0;34m.[0m[0mOptimize

In [None]:
from sklearn.base import BaseEstimator
class MyTabMLP(BaseEstimator): # Inherits sklearn base classifier
    def __init__(self, ):
        Trainer.__init__()
    def fit(self, X, y, sample_weight = None):
        X['wide']
    def predict(self, X):
        pass
    def predict_proba(self, X):
        pass
    def score(self, X, y, sample_weight = None):
        pass

In [None]:
rp = LearningWithNoisyLabels(clf=model, cv_n_folds=1, seed=42, **rp_params)
rp.fit(X_train, y_train)

In [None]:
#             print(f"type(X_train_wide) is {type(X_train_wide)} and type(X_train_tab) is {type(X_train_tab)}")
clean_trainer.fit( # this is where problem is beginning
    X_wide=X_wide_train,
    X_tab=X_tab_train,
    target=y_train,
    n_epochs=50,
    batch_size=1024, # default value is 32
#                 val_split=0.2, # no need for this
)

y_valid_preds = trainer.predict_proba(X_wide=X_wide_valid, X_tab=X_tab_valid, batch_size=1024)[:,1]

In [41]:
dirty_preds.shape

(540000,)

In [42]:
sample_df.shape

(540000, 2)

In [43]:
dirty_preds_float64 = dirty_preds.astype(np.float64)

In [44]:
sample_df.loc[:, 'target'] = dirty_preds

In [45]:
sample_df.head()

Unnamed: 0,id,target
0,600000,0.781741
1,600001,0.547938
2,600002,0.714756
3,600003,0.340266
4,600004,0.561223


In [46]:
sample_df.to_csv(subpath/f"{wandb_config['name']}-TabMLP-dirtydata-holdout-baseline_preds_floatified.csv", index=False)


  and should_run_async(code)


## Clean Run

In [31]:
exmodel_config['denoising'] = 'cleanlab'

wandb_config = {
    # wandb config
    'name': f"{os.environ['WANDB_NOTEBOOK_NAME'][:-6]}_{datetime.now().strftime('%H%M%S')}", # just removes the .ipynb extension, leaving the notebook filename's stem
    'project': '202111_Kaggle_tabular_playground',
    'tags': ['experiment'],
    'notes': "Going to try CleanLab with the 'best' params from the previous sweep, but wanting to see what the actual LB score is, and how it compares to the noisy one. Using a default LGBMClassfier with holdout on robust-scaled original dataset.",
    'config': exmodel_config,
}

wandb.init(
    project="202111_Kaggle_tabular_playground",
    save_code=True,
    tags=wandb_config['tags'],
    name=wandb_config['name'],
    notes=wandb_config['notes'],
    config=exmodel_config
) 

[34m[1mwandb[0m: wandb version 0.12.7 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


In [32]:
rp_params = {
    # 'prune_method': trial.suggest_categorical('prune_method', ['prune_by_noise_rate', 'prune_by_class', 'both']),
    # 'converge_latent_estimates': trial.suggest_categorical('converge_latent_estimates', [True, False]),
    # 'pulearning': trial.suggest_categorical('pulearning', [0,1,None])
    'prune_method': 'both',
    'converge_latent_estimates': True,
    'pulearning': 1,
}

In [33]:
lgb_model = LGBMClassifier(
        objective='binary',
        random_state=42,
        #                     device_type='cpu',
        #                     n_jobs=-1,
        #                 eval_metric='auc',
        device_type='gpu',
        max_bin=63, # 15 might be even better for GPU perf, but depends on dataset -- see https://lightgbm.readthedocs.io/en/latest/GPU-Performance.html
        gpu_use_dp=False, # forces use of single precision rather than double for better perf, esp on consumer Nvidia chips
    )

In [34]:
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

In [35]:
scaler = RobustScaler()

In [36]:
scaler.fit_transform(X_train)

array([[-0.27296268,  0.01579322, -0.19031588, ...,  0.50363337,
         0.17720313, -0.58369832],
       [-0.18106382,  0.2410393 ,  5.30773161, ...,  0.24955035,
        -0.48450355, -0.62285755],
       [ 6.07300903, -0.85078822,  0.17280319, ..., -0.62764574,
        -0.67894852,  0.73726127],
       ...,
       [ 4.18856991, -0.68824246,  0.21225385, ...,  0.88726112,
         6.19955807, -0.09918299],
       [-0.300117  ,  0.13080398, -0.54802214, ..., -0.3979886 ,
        -0.01474   ,  0.22398866],
       [-0.29279129,  0.62179137,  0.07317624, ...,  0.14508993,
        12.038684  ,  0.6437292 ]])

In [37]:
scaler.transform(X_valid)

array([[ 1.27745959, -0.79095855,  3.57879036, ...,  0.40431588,
         0.30027418,  0.45467042],
       [-0.35032922, -0.36662758, -0.83368479, ...,  0.48734142,
        -0.59656634,  9.97028335],
       [ 2.07990909,  0.20997962, -0.0127639 , ..., -0.80240405,
        -0.22697866,  0.23615553],
       ...,
       [ 0.80633635,  0.39936366,  0.49845399, ...,  0.68680344,
        -1.04204429,  0.24502101],
       [-0.06863608, -0.27843164, -0.36283929, ...,  0.18702425,
         0.57010813, -0.05371796],
       [-0.3620719 , -0.30144763, -0.06982129, ..., -0.73907299,
        -1.0792864 ,  0.24365673]])

In [39]:
X_train = np.array(X_train)
X_valid = np.array(X_valid)
y_train = np.array(y_train)
y_valid = np.array(y_valid)

In [40]:
rp = LearningWithNoisyLabels(clf=lgb_model, **rp_params)
rp.fit(X_train, y_train)

LGBMClassifier(device_type='gpu', gpu_use_dp=False, max_bin=63,
               objective='binary', random_state=42)

In [41]:
preds = rp.predict_proba(X_valid)[:,1]
valid_auc = roc_auc_score(y_true=y_valid, y_score=preds)
valid_auc

0.7242336321944994

In [42]:
rp.get_params()

{'clf__boosting_type': 'gbdt',
 'clf__class_weight': None,
 'clf__colsample_bytree': 1.0,
 'clf__importance_type': 'split',
 'clf__learning_rate': 0.1,
 'clf__max_depth': -1,
 'clf__min_child_samples': 20,
 'clf__min_child_weight': 0.001,
 'clf__min_split_gain': 0.0,
 'clf__n_estimators': 100,
 'clf__n_jobs': -1,
 'clf__num_leaves': 31,
 'clf__objective': 'binary',
 'clf__random_state': 42,
 'clf__reg_alpha': 0.0,
 'clf__reg_lambda': 0.0,
 'clf__silent': 'warn',
 'clf__subsample': 1.0,
 'clf__subsample_for_bin': 200000,
 'clf__subsample_freq': 0,
 'clf__device_type': 'gpu',
 'clf__max_bin': 63,
 'clf__gpu_use_dp': False,
 'clf': LGBMClassifier(device_type='gpu', gpu_use_dp=False, max_bin=63,
                objective='binary', random_state=42),
 'converge_latent_estimates': True,
 'cv_n_folds': 5,
 'n_jobs': 16,
 'prune_method': 'both',
 'pulearning': 1,
 'seed': None}

In [43]:
wandb.log({'overall_valid_auc': valid_auc,
           'model_params': str(lgb_model.get_params()),
           'model_seed': 42,
           'cleanlab_params': str(rp.get_params())
          })
wandb.finish()


VBox(children=(Label(value=' 0.09MB of 0.09MB uploaded (0.04MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
model_seed,▁
overall_valid_auc,▁

0,1
cleanlab_params,{'clf__boosting_type...
model_params,{'boosting_type': 'g...
model_seed,42
overall_valid_auc,0.72423


In [44]:
dump(preds, predpath/'cleanlab_lgbm_20211122.joblib')

['/media/sf/easystore/kaggle_data/tabular_playgrounds/nov2021/preds/cleanlab_lgbm_20211122.joblib']

## Submission

In [59]:
X_test = pd.read_feather(dataset_params['test_source'])

In [60]:
scaler = RobustScaler()

In [61]:
scaler.fit(X)

RobustScaler()

In [62]:
scaler.transform(X_test)

array([[-2.54904070e-01,  8.92660373e-01,  1.74689787e+00, ...,
        -3.14632989e-01,  7.56057179e-01, -5.35039504e-01],
       [-2.40419666e-01, -7.73033673e-01, -9.03498547e-01, ...,
         4.04404276e-01, -1.09484041e-01,  7.09297297e-01],
       [ 3.67480353e+00, -3.04606091e-02, -9.53230106e-01, ...,
         2.60323086e-01,  5.22578209e-01,  7.16351788e-02],
       ...,
       [ 7.74437122e-01,  4.76283375e-03, -7.63896086e-01, ...,
        -1.52318988e+00,  7.39754091e+00,  7.52089854e-01],
       [ 4.71329335e+00,  3.45526482e-01, -4.12767694e-01, ...,
        -9.01228907e-01, -3.59849849e-02,  2.78337176e-01],
       [ 1.01900951e+00,  5.92913219e-02,  9.90131581e-01, ...,
        -9.60166650e-01,  1.45144893e-01,  5.01730920e-01]])

In [63]:
dirty_preds = rp.predict_proba(X_test)[:,1]
clean_preds = lgb_model.predict_proba(X_test)[:,1]

In [64]:
sample_df = pd.read_csv(datapath/'sample_submission.csv')

In [65]:
sample_df.head()

Unnamed: 0,id,target
0,600000,0.5
1,600001,0.5
2,600002,0.5
3,600003,0.5
4,600004,0.5


In [67]:
dirty_preds.shape

(540000,)

In [69]:
clean_preds.shape

(540000,)

In [68]:
sample_df.shape

(540000, 2)

In [70]:
sample_df.loc[:, 'target'] = dirty_preds

In [71]:
sample_df.head()

Unnamed: 0,id,target
0,600000,0.866322
1,600001,0.948761
2,600002,0.976896
3,600003,0.810975
4,600004,0.665766


In [None]:
# submission_path = datapath/'submissions'
# submission_path.mkdir(exist_ok=True)

In [72]:
sample_df.to_csv(subpath/f"{wandb_config['name']}_basic_LGBM_preds.csv", index=False)
# sample_df.to_csv(subpath/f"{wandb_config['name']}_3level-X_orig+KMeans8+synth-GBM-stack_ensemble_preds.csv", index=False)

In [73]:
sample_df.loc[:, 'target'] = clean_preds

In [74]:
sample_df.head()

Unnamed: 0,id,target
0,600000,0.647848
1,600001,0.632269
2,600002,0.797033
3,600003,0.556056
4,600004,0.410237


In [75]:
sample_df.to_csv(subpath/f"{wandb_config['name']}_cleanlab_basic_LGBM_preds.csv", index=False)


# CONCLUSION
Actually, Cleanlab works: LB score with cleaning is 0.72402 compared to 0.71712 with the dirty one.

In [19]:
def cross_validate_model(arch:str, X, y, X_test, params:dict={}, start_fold=0, 
                         exmodel_config=exmodel_config, wandb_config=wandb_config, 
                         random_state=42, shuffle_kfolds=True, wandb_tracked=True, encode_cats=False):
    """
    Function to handle model training process in the context of cross-validation -- via hold-out or via k-fold.
    If exmodel_config['cross_val_strategy'] == None, then any kfolds= input is ignored; otherwise, the number specified is used.
    
    :param kfolds: int specifying number of k-folds to use in cross-validation
    :param exmodel_config: dict containing general config including for cross-validation -- `kfold=1` implies hold-out
    """
#     if exmodel_config['kfolds'] == 1:
#         print("Proceeding with holdout")
#     X_train, X_valid, y_train, y_valid = train_test_split(X, y, 
#                                                           test_size=0.2, 
#                                                           random_state=SEED)                 
    
    # prepare for k-fold cross-validation; random-state here is notebook-wide, not per-model
    # shuffle on the initial sets, but not subsequently -- performing the same operation twice means a very different dataset
    if shuffle_kfolds:
        kfold = exmodel_config['cross_val_strategy'](n_splits=exmodel_config['kfolds'], shuffle=True, random_state=SEED)
    else:
        kfold = exmodel_config['cross_val_strategy'](n_splits=exmodel_config['kfolds'], shuffle=False)
    
    if wandb_tracked:
        exmodel_config['arch'] = arch
        exmodel_config[f'{arch}_params'] = str(params)
        wandb.init(
            project="202111_Kaggle_tabular_playground",
            save_code=True,
            tags=wandb_config['tags'],
            name=wandb_config['name'],
            notes=wandb_config['notes'],
            config=exmodel_config
    )   
    
    # setup for serialization
    # runpath = Path(modelpath/f"{wandb_config['name']}_{arch}_{exmodel_config['kfolds']}folds/")
    # (runpath).mkdir(exist_ok=True)
    
    
    # initialize lists for out-of-fold preds and ground truth
    oof_preds, oof_y = [], []
    
    # initialize a numpy.ndarray containing the fold-model's preds for test set
    test_preds = np.zeros((X_test.shape[0]))
    
    # if using deep learning with pytorch-widedeep, do data preprocessing now, before splits
    if 'widedeep' in arch:
        # NOTE THAT ENCODING NOT DEPLOYED FOR THIS YET
        # preprocessing first
        wide_cols = [f for f in X.columns if X[f].nunique() == 2] #list(X_train.columns) if X_train.iloc[:,f].nunique() == 2] # binary indicator vars are wide
        cont_cols = [f for f in X.columns if X[f].nunique() > 2] #list(X_train.columns) if X_train.iloc[:,f].nunique() > 2] # others are cont

        # wide part
        # wide_preprocessor = WidePreprocessor(wide_cols=wide_cols)
        # X_wide = wide_preprocessor.fit_transform(X)
#         print(f"X_wide.shape = {X_wide.shape}")
#         X_wide = np.array(X_train[wide_cols])
        

        # deep part
        tab_preprocessor = TabPreprocessor(continuous_cols=cont_cols)#, embed_cols=embed_cols, )
        X_tab = tab_preprocessor.fit_transform(X)   
#         print(f"X_tab.shape = {X_tab.shape}")
        
        # transforming the test set
        X_test_wide = wide_preprocessor.transform(X_test)
        X_test_tab = tab_preprocessor.transform(X_test)
        
        # at this point, X_wide, X_tab, X_test_wide, and X_test_tab will all be np.ndarrays
    
#     else: # if using a GBM, simply convert the pd.DataFrames to np.ndarrays
#         X = np.array(X) # CAN YOU USE CATEGORY_ENCODERS ON NP.NDARRAYS?
#         X_test = np.array(X_test)
    
    for fold, (train_ids, valid_ids) in enumerate(kfold.split(X,y)):
#         print(f"type(train_ids) = {type(train_ids)} and train_ids.shape = {train_ids.shape}")
#         print(f"type(valid_ids) = {type(valid_ids)} and train_ids.shape = {valid_ids.shape}")
        if fold < start_fold: # skip folds that are already trained
            continue
        else:
            print(f"FOLD {fold}")
            print("---------------------------------------------------")
            y_train, y_valid = y[train_ids], y[valid_ids] # y will be an np.ndarray already; handling will be same regardless of model
            if 'widedeep' in arch: # handle wide and deep tabs in parallel
                X_train_wide, X_valid_wide = X_wide[train_ids, :], X_wide[valid_ids, :]
                X_train_tab, X_valid_tab = X_tab[train_ids, :], X_tab[valid_ids, :]
#                 print(f"X_train_wide.shape = {X_train_wide.shape}")
#                 print(f"X_train_tab.shape = {X_train_tab.shape}")
#                 print(f"X_test_wide.shape = {X_test_wide.shape}")
#                 print(f"X_test_tab.shape = {X_test_tab.shape}")
            else: # handle datasets for GBMs
                if isinstance(X, np.ndarray):
                    X_train, X_valid = X[train_ids], X[valid_ids]
                else:
                    X_train, X_valid = X.iloc[train_ids,:], X.iloc[valid_ids,:] # bc need pandas.DataFrames for ce
                # if encode_cats:
                #     encoder = ce.WOEEncoder(cols=categoricals)
                #     encoder.fit(X_train,y_train)
                #     X_train = encoder.transform(X_train)
                #     X_valid = encoder.transform(X_valid)
                # # exmodel_config['feature_count'] = len(X.columns)
                #     wandb.log({
                #         'feature_count': X_train.shape[1],
                #         'instance_count': X_train.shape[0],
                #         'encoder': str(encoder)
                #     })
#                 exmodel_config['instance_count'] = X_train.shape[0]
#                 exmodel_config['encoder'] = str(encoder)
#                     X_test = encoder.transform(X_test)
#                 y_train, y_valid = y[train_ids], y[valid_ids]
            
        # scaling
        scaler = RobustScaler()
        X_train = scaler.fit_transform(X_train)
        X_valid = scaler.transform(X_valid)
    
        # define models
        if arch == 'xgboost':
            model = XGBClassifier(
                booster='gbtree',
                tree_method='gpu_hist',
                random_state=random_state,
                n_jobs=-1, 
                verbosity=1, 
                objective='binary:logistic',
                **params)
            if wandb_tracked:
                model.fit(X_train, y_train, callbacks=[wandb.xgboost.wandb_callback()])
            else:
                model.fit(X_train, y_train)
            y_valid_preds = model.predict_proba(X_valid)[:,1]
            
            # add the fold-model's OOF preds and ground truths to the out-of-loop lists
            oof_preds.extend(y_valid_preds)
            oof_y.extend(y_valid)
            
            # add the fold's predictions to the model's test-set predictions (will divide later)
            test_preds += model.predict_proba(X_test)[:,1]


        elif arch == 'lightgbm':
            try:
                model = LGBMClassifier(
                    objective='binary',
                    random_state=random_state,
#                     device_type='cpu',
#                     n_jobs=-1,
    #                 eval_metric='auc',
                    device_type='gpu',
                    max_bin=63, # 15 might be even better for GPU perf, but depends on dataset -- see https://lightgbm.readthedocs.io/en/latest/GPU-Performance.html
                    gpu_use_dp=False, # forces use of single precision rather than double for better perf, esp on consumer Nvidia chips
                    **params)
                
                if wandb_tracked:
                    model.fit(X_train, y_train, callbacks=[wandb.lightgbm.wandb_callback()],)
                else:
                    model.fit(X_train, y_train)
            except LightGBMError:
                model = LGBMClassifier(
                    objective='binary',
                    random_state=random_state,
                    device_type='cpu',
                    n_jobs=-1,
    #                 eval_metric='auc',
    #                 device_type='gpu',
    #                 max_bin=63, # 15 might be even better for GPU perf, but depends on dataset -- see https://lightgbm.readthedocs.io/en/latest/GPU-Performance.html
    #                 gpu_use_dp=False, # forces use of single precision rather than double for better perf, esp on consumer Nvidia chips
                    **params)
                
                if wandb_tracked:
                    model.fit(X_train, y_train, callbacks=[wandb.lightgbm.wandb_callback()],)
                else:
                    model.fit(X_train, y_train)
            y_valid_preds = model.predict_proba(X_valid)[:,1]
            
            # add the fold-model's OOF preds and ground truths to the out-of-loop lists
            oof_preds.extend(y_valid_preds)
            oof_y.extend(y_valid)
            
            # add the fold's predictions to the model's test-set predictions (will divide later)
            test_preds += model.predict_proba(X_test)[:,1]

            
        elif arch == 'catboost':
            model = CatBoostClassifier(
                task_type='GPU',
                silent=True,
                random_state=random_state,
                **params) 
        
            model.fit(X_train, y_train)
            y_valid_preds = model.predict_proba(X_valid)[:,1]
            
            # add the fold-model's OOF preds and ground truths to the out-of-loop lists
            oof_preds.extend(y_valid_preds)
            oof_y.extend(y_valid)
            
            # add the fold's predictions to the model's test-set predictions (will divide later)
            test_preds += model.predict_proba(X_test)[:,1]
            
        elif 'widedeep' in arch: # only coding for TabMlp right now
#             X_train = pd.DataFrame(X_train, columns=[f"f{x}" for x in range(X_train.shape[1])])
#             X_valid = pd.DataFrame(X_valid, columns=[f"f{x}" for x in range(X_valid.shape[1])])
#             X_test = pd.DataFrame(X_test, columns=[f"f{x}" for x in range(X_test.shape[1])])
            
            wide = Wide(wide_dim=np.unique(X_wide).shape[0], pred_dim=1)
            deeptabular = TabMlp(
                mlp_hidden_dims=[64,32],
                column_idx=tab_preprocessor.column_idx,
            #     embed_input=tab_preprocessor.embeddings_input,
                continuous_cols=cont_cols,
            )
            
            # model instantiation and training
            model = WideDeep(wide=wide, deeptabular=deeptabular)
            
            
            n_epochs = 300

            # pytorch hyperparams
            wide_opt = AdamW(model.wide.parameters(), lr=0.1)
            deep_opt = AdamW(model.deeptabular.parameters(), lr=0.1)
            
            wide_sch = OneCycleLR(optimizer=wide_opt, max_lr=0.01, steps_per_epoch=X_train_wide.shape[0], epochs=n_epochs)
            deep_sch = OneCycleLR(optimizer=deep_opt, max_lr=0.01, steps_per_epoch=X_train_tab.shape[0], epochs=n_epochs)
            
            optimizers = {'wide': wide_opt, 'deeptabular': deep_opt }
            lr_schedulers = {'wide': wide_sch, 'deeptabular': deep_sch }
            
            
            callbacks = [
                LRHistory(n_epochs=n_epochs), 
            ]
            
            # trainer
            trainer = Trainer(model=model, 
                              objective='binary', 
                              metrics=[Accuracy], # with AUROC got TypeError: '>' not supported between instances of 'NoneType' and 'int' 
                              seed=random_state, 
                              optimizers=optimizers,
                              callbacks=callbacks
                             )
            
#             print(f"type(X_train_wide) is {type(X_train_wide)} and type(X_train_tab) is {type(X_train_tab)}")
            trainer.fit( # this is where problem is beginning
                X_wide=X_train_wide,
                X_tab=X_train_tab,
                target=y_train,
                n_epochs=n_epochs,
                batch_size=1024, # default value is 32
#                 val_split=0.2, # no need for this
            )
            
            y_valid_preds = trainer.predict_proba(X_wide=X_valid_wide, X_tab=X_valid_tab, batch_size=1024)[:,1]
            
            # add the fold-model's OOF preds and ground truths to the out-of-loop lists
            oof_preds.extend(y_valid_preds)
            oof_y.extend(y_valid)
            
            
            # test set inference
            fold_test_preds = trainer.predict_proba(X_wide=X_test_wide, X_tab=X_test_tab, batch_size=1024)[:,1]
            test_preds += fold_test_preds
            

        
        
       
        
        

#         valid_loss = log_loss(y_valid, y_pred)
        # give the valid AUC score, for edification
        fold_valid_auc = roc_auc_score(y_valid, y_valid_preds)
        if wandb_tracked:
            wandb.log({f'fold{fold}_valid_roc_auc': fold_valid_auc})
        print(f"Valid AUC for fold {fold} is {fold_valid_auc}")   
        # dump(model, Path(runpath/f"{arch}_fold{fold}_rs{random_state}_model.joblib"))

    model_valid_auc = roc_auc_score(oof_y, oof_preds)
    print(f"Valid AUC score for {arch} model is {model_valid_auc}")
    if wandb_tracked:
        wandb.log({'overall_valid_auc': model_valid_auc,
                   'model_params': str(model.parameters()) if 'widedeep' in arch else str(model.get_params()),
                   'model_seed': random_state,
                  })
        wandb.finish()
    
    # finalize test preds
    test_preds /= exmodel_config['kfolds']
    
    # save OOF preds and test-set preds
#     if 'widedeep' in arch:
#         dump(oof_preds, Path(predpath/f"{wandb_config['name']}_{arch}_{exmodel_config['kfolds']}folds_{n_epochs}epochs-per-fold_rs{random_state}_oof_preds.joblib"))
#         dump(test_preds, Path(predpath/f"{wandb_config['name']}_{arch}_{exmodel_config['kfolds']}folds_{n_epochs}epochs-per-fold_rs{random_state}_test_preds.joblib"))
    
#     else:
#         dump(oof_preds, Path(predpath/f"{wandb_config['name']}_{arch}_{exmodel_config['kfolds']}folds_rs{random_state}_oof_preds.joblib"))
#         dump(test_preds, Path(predpath/f"{wandb_config['name']}_{arch}_{exmodel_config['kfolds']}folds_rs{random_state}_test_preds.joblib"))
    
    if not (datapath/f"{exmodel_config['kfolds']}folds_rs{SEED}_oof_y.joblib").is_file():
        dump(oof_y, predpath/f"{exmodel_config['kfolds']}folds_rs{SEED}_oof_y.joblib")
    
#     if wandb_tracked:
# #         if 'widedeep' in arch:
#         wandb.log({'model_valid_auc': model_valid_auc,
# #                    'oof_preds': oof_preds,
# #                    'test_preds': test_preds,
#                    'model_params': str(model.parameters()) if 'widedeep' in arch else str(model.get_params()), 
#         #                    'model_params': str(model.get_params()),
#         })
# #         wandb.log({'model_valid_auc': model_valid_auc,
# #                    'oof_preds': oof_preds,
# #                    'test_preds': test_preds,
# # #                    'model_params': str(model.get_params()),
# #                   })
#         wandb.finish()
    return oof_preds, test_preds
        