# CleanLab with WideDeep experiment
Want to see if I can use the WideDeep classifier with CleanLab without resorting to `skorch` or a custom class.

In [1]:
# notebook configuration
COLAB = False # will trigger manual installation of packages
USE_GPU = True 

In [2]:
# basic imports
from pathlib import Path
import os
import math
from datetime import datetime
import random

In [3]:
%matplotlib inline
%config Completer.use_jedi = False
os.environ['WANDB_NOTEBOOK_NAME'] = f"cleanlab_{datetime.now().strftime('%Y%m%d')}.ipynb"

Now, non-stdlib imports

In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm

# general ML tooling
from sklearn.model_selection import train_test_split, StratifiedKFold, KFold
from sklearn.metrics import log_loss, roc_auc_score
import wandb
from wandb.xgboost import wandb_callback
from wandb.lightgbm import wandb_callback
from sklearn.impute import SimpleImputer #, KNNImputer

import seaborn as sns

from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
# from sklearn.ensemble import StackingClassifier, RandomForestClassifier
from sklearn.preprocessing import RobustScaler #StandardScaler #, MinMaxScaler, MaxAbsScaler, RobustScaler, PolynomialFeatures
from sklearn.linear_model import LogisticRegression
# from sklearn.feature_selection import SelectKBest, f_regression
from joblib import dump, load
# feature engineering tools
# from sklearn.feature_selection import mutual_info_regression
# import featuretools as ft

# from BorutaShap import BorutaShap
import optuna
from optuna.integration.wandb import WeightsAndBiasesCallback
from optuna.samplers import TPESampler
from sklearn.utils import resample

In [5]:
from cleanlab.classification import LearningWithNoisyLabels

In [6]:
from pytorch_widedeep import Trainer
from pytorch_widedeep.preprocessing import WidePreprocessor, TabPreprocessor
from pytorch_widedeep.models import Wide, TabMlp, WideDeep#, SAINT, TabTransformer, TabNet, TabFastFormer, TabResnet
from pytorch_widedeep.metrics import Accuracy
from torchmetrics import AUROC
import torch
from torch.optim import Adam, AdamW, Adagrad, SGD, RMSprop, LBFGS
from torch.optim.lr_scheduler import ReduceLROnPlateau, CosineAnnealingWarmRestarts, CyclicLR, OneCycleLR, StepLR, CosineAnnealingLR
from pytorch_widedeep.callbacks import EarlyStopping, LRHistory, ModelCheckpoint

In [7]:
# import category_encoders as ce

Now, datapath setup

In [8]:
if COLAB:
    # mount Google Drive
    from google.colab import drive
    drive.mount('/content/drive')
    
    # handling datapath
    datapath = Path('/content/drive/MyDrive/kaggle/tabular_playgrounds/nov2021/')
    
else:
    # if on local machine
#     datapath = Path('/media/sf/easystore/kaggle_data/tabular_playgrounds/sep2021/')  
    root = Path('/media/sf/easystore/kaggle_data/tabular_playgrounds/nov2021/')
    datapath = root/'datasets'
    # edapath = root/'EDA'
    # modelpath = Path('/media/sf/easystore/kaggle_data/tabular_playgrounds/oct2021/models/')
    predpath = root/'preds'
    subpath = root/'submissions'
    
    for pth in [datapath, predpath, subpath]:
        pth.mkdir(exist_ok=True)
    


## Helpers

In [9]:
SEED = 42

# Function to seed everything
def seed_everything(seed):
    random.seed(seed)
    np.random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)

seed_everything(seed=SEED)

In [10]:
def reduce_memory_usage(df, verbose=True):
    numerics = ["int8", "int16", "int32", "int64", "float16", "float32", "float64"]
    start_mem = df.memory_usage().sum() / 1024 ** 2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == "int":
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if (
                    c_min > np.finfo(np.float16).min
                    and c_max < np.finfo(np.float16).max
                ):
                    df[col] = df[col].astype(np.float16)
                elif (
                    c_min > np.finfo(np.float32).min
                    and c_max < np.finfo(np.float32).max
                ):
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024 ** 2
    if verbose:
        print(
            "Mem. usage decreased to {:.2f} Mb ({:.1f}% reduction)".format(
                end_mem, 100 * (start_mem - end_mem) / start_mem
            )
        )
    return df

## Metadata

In [11]:
# dataset_params will initially include either trivial class instances or loaded, precomputed artifacts
dataset_params = {
    # 'train_source': str(datapath/'X-RobustScaled-pca_mle-umap_embedding_20211107-n_comp10-n_neighbors15-rs42.joblib'),
    'train_source': str(datapath/'X_orig.feather'),
    'target_source': str(datapath/'y_orig.joblib'),
    # 'test_source': str(datapath/'X_test-RobustScaled-pca_mle-umap_embedding_20211107-n_comp10-n_neighbors15-rs42.joblib'),
    'test_source': str(datapath/'X_test_orig-no_scaling.feather'),
    # 'scaler': str(RobustScaler()),
    # 'pca': str(load(datapath/'pca_mle-RobustScaled_orig_trainset.joblib')),
    # 'umap': str(load(datapath/'umap_reducer-20211107-n_comp10-n_neighbors15-rs42-pca_mle-RobustScaled_orig_trainset.joblib')),
}   

# referring back to the already-entered attributes, specify how the pipeline was sequenced
# dataset_params['preprocessing_pipeline'] = str([dataset_params['scaler'], dataset_params['pca'], dataset_params['umap']]) # ACTUALLY this is unwieldy
# dataset_params['preprocessing_pipeline'] = '[scaler, pca, umap]' # more fragile, but also more readable

# now, load the datasets and generate more metadata from them
# X = load(dataset_params['train_source'])
X = pd.read_feather(dataset_params['train_source'])
y = load(dataset_params['target_source'])
# X_test = load(dataset_params['test_source'])
X_test = pd.read_feather(dataset_params['test_source'])

dataset_params['feature_count'] = X.shape[1]
dataset_params['instance_count'] = X.shape[0]
    

In [15]:
# meta-config for preprocessing and cross-validation, but NOT for model parameters
# in the sweep version, this includes both ex-model parameters and defaults for model parameters
exmodel_config = {
    "arch": 'widedeep-TabMLP',
    # "type": 'sweep',
    "denoising": "cleanlab",
    "level": 1,
    'random_state': SEED,
    # 'tuner': "Optuna",
    'cross_val_strategy': None, # None for holdout, or the relevant sklearn class
    'kfolds': 1, # if 1, that means just doing holdout
    'test_size': 0.2,
    'scaler': str(RobustScaler()),
    **dataset_params
}

wandb_config = {
    # wandb config
    'name': f"{os.environ['WANDB_NOTEBOOK_NAME'][:-6]}_{datetime.now().strftime('%H%M%S')}", # just removes the .ipynb extension, leaving the notebook filename's stem
    'project': '202111_Kaggle_tabular_playground',
    'tags': ['experiment'],
    'notes': "Going to try CleanLab with the 'best' params from the previous sweep, but wanting to see what the actual LB score is, and how it compares to the noisy one. Using a default LGBMClassfier with holdout on robust-scaled original dataset.",
    'config': exmodel_config,
}

In [12]:
# decrease memory footprint
X = reduce_memory_usage(X)
X_test = reduce_memory_usage(X_test)

  and should_run_async(code)


Mem. usage decreased to 114.44 Mb (75.0% reduction)
Mem. usage decreased to 103.00 Mb (75.0% reduction)


In [13]:
# meta-config for preprocessing and cross-validation, but NOT for model parameters
exmodel_config = {
#     "feature_selector": SelectKBest,
#     "k_best": 80,
#     "feature_selection_scoring": f_regression,
#     'random_state': SEED,
#     'feature_generation': ['NaN_counts', 'SummaryStats', 'NaN_OneHots'],
#     'subsample': 1,
    'cross_val_strategy': None, # None for holdout, or the relevant sklearn class
    'kfolds': 1, # if 1, that means just doing holdout
    'test_size': 0.2,
    **dataset_params
#     'features_created': False,
#     'feature_creator': None,
}

In [14]:
# wandb config:
wandb_config = {
    'name': f"{os.environ['WANDB_NOTEBOOK_NAME'][:-6]}_{datetime.now().strftime('%H%M%S')}", # just removes the .ipynb extension, leaving the notebook filename's stem
    'tags': ['widedeep', 'deeplearning'],
    'notes': "Trying a variety of widedeep models, to see if I can get any working properly."
}

# Preprocessing Data
Inspired a bit by Laurent Pourchot's Aug2021 Tabular Playground entry, I'm going to try to generate two versions of the dataset: a categorical one, using bins, and then (for now) a GaussRankScaled one. In the future, I might add further variations, e.g. with feature reduction via PCA and perhaps also UMAP and also denoising; I might also try other normalizations, e.g. Quantile.

## Binning (Generating wide cols)

In [15]:
# h/t Laurent Pourchot https://www.kaggle.com/pourchot/in-python-tabular-denoising-residual-network/

# 100 bins for the bins head of the NN (i.e. percentiles):
X_bins = np.zeros((X.shape[0],X.shape[1])) # he used all available data for the first tuple entry, but I'll start like this

In [16]:
X_bins.shape

(600000, 100)

In [17]:
for i in range(X.shape[1]): # assumes X is a pd.DataFrame
    X_bins[:,i] = pd.qcut(X.iloc[:,i],X.shape[1],labels=False,duplicates = 'drop')
# blabeled = X_bins[:X.shape[0],:]
# bunlabeled = X_ins[X.shape[0]:,:]

In [18]:
X_bins

  and should_run_async(code)


array([[52., 70., 49., ..., 37., 64., 88.],
       [56., 35., 33., ..., 48., 10., 28.],
       [28., 31., 70., ..., 40., 79., 85.],
       ...,
       [94., 11., 31., ..., 16., 64., 46.],
       [80., 75., 89., ..., 58., 51., 17.],
       [64., 67., 56., ..., 37., 44., 41.]])

In [19]:
X_bins = X_bins.astype(np.int8)

In [20]:
X_bins = pd.DataFrame(X_bins, index=X.index, columns=[f'rkd_f{col}' for col in range(100)])

In [21]:
X_bins.head()

Unnamed: 0,rkd_f0,rkd_f1,rkd_f2,rkd_f3,rkd_f4,rkd_f5,rkd_f6,rkd_f7,rkd_f8,rkd_f9,...,rkd_f90,rkd_f91,rkd_f92,rkd_f93,rkd_f94,rkd_f95,rkd_f96,rkd_f97,rkd_f98,rkd_f99
0,52,70,49,60,61,23,68,44,42,20,...,18,22,22,6,44,52,83,37,64,88
1,56,35,33,63,69,95,22,1,88,36,...,90,70,24,94,87,97,2,48,10,28
2,28,31,70,41,29,60,94,75,36,48,...,86,94,58,30,76,10,0,40,79,85
3,11,7,89,34,18,30,48,87,88,83,...,6,70,11,7,49,35,10,37,39,41
4,15,73,55,40,24,41,78,3,66,33,...,20,40,4,86,79,32,20,19,13,58


## Normalizing (Preprocessing Deep Cols)

In [22]:
scaler = GaussRankScaler()
X_gauss = scaler.fit_transform(X)

  and should_run_async(code)


In [23]:
X.head()

  and should_run_async(code)


Unnamed: 0,f0,f1,f2,f3,f4,f5,f6,f7,f8,f9,...,f90,f91,f92,f93,f94,f95,f96,f97,f98,f99
0,0.106628,3.59375,132.75,3.183594,0.08197,1.188477,3.732422,2.265625,2.099609,0.012329,...,0.010742,1.098633,0.013329,-0.011719,0.052765,0.06543,4.210938,1.978516,0.085999,0.240479
1,0.125,1.673828,76.5625,3.378906,0.099426,5.09375,1.275391,-0.471436,4.546875,0.03772,...,0.135864,3.460938,0.017059,0.124878,0.154053,0.606934,-0.267822,2.578125,-0.020874,0.024719
2,0.036316,1.49707,233.5,2.195312,0.026917,3.126953,5.058594,3.849609,1.801758,0.057007,...,0.11731,4.882812,0.085205,0.03241,0.116089,-0.001689,-0.52002,2.140625,0.124451,0.148193
3,-0.014076,0.245972,780.0,1.890625,0.006947,1.53125,2.697266,4.515625,4.503906,0.123474,...,-0.01535,3.474609,-0.017105,-0.008102,0.062012,0.041199,0.511719,1.96875,0.040009,0.044861
4,-0.00326,3.714844,156.125,2.148438,0.01828,2.097656,4.15625,-0.038239,3.371094,0.03418,...,0.013779,1.910156,-0.042938,0.105591,0.125122,0.037506,1.043945,1.075195,-0.012817,0.072815


In [24]:
X_gauss = pd.DataFrame(X_gauss, columns=X.columns, index=X.index)

  and should_run_async(code)


In [25]:
X_gauss.head()

Unnamed: 0,f0,f1,f2,f3,f4,f5,f6,f7,f8,f9,...,f90,f91,f92,f93,f94,f95,f96,f97,f98,f99
0,0.503864,1.170148,0.497627,1.061322,0.497063,0.697261,1.178046,0.918892,0.89803,0.26461,...,0.313791,0.697968,0.228935,-0.745756,0.410898,0.511767,1.301817,0.886265,0.491337,0.715292
1,0.532606,0.817177,0.414955,1.09994,0.523378,1.430099,0.707201,-1.001742,1.361183,0.395362,...,0.712643,1.133297,0.258559,0.941823,0.550487,0.937882,-0.799881,0.975537,-0.595291,0.375001
2,0.364383,0.777697,0.581712,0.902238,0.358373,1.046707,1.450572,1.213861,0.833888,0.448396,...,0.684576,1.420673,0.444982,0.597667,0.51478,-0.345367,-1.076664,0.911155,0.547897,0.627946
3,-0.552709,0.361766,0.800397,0.847488,0.20525,0.767239,0.975344,1.338971,1.352906,0.554931,...,-0.698743,1.136185,-0.622266,-0.67141,0.436283,0.439042,0.506585,0.883681,0.397615,0.451311
4,-0.357576,1.199284,0.516476,0.895768,0.313432,0.890002,1.273778,-0.348193,1.110106,0.385873,...,0.349392,0.873186,-0.772866,0.873466,0.528289,0.427691,0.67996,0.684102,-0.528382,0.520949


## Preparing Data for WideDeep

In [26]:
X_pre = X_gauss.join(X_bins)

  and should_run_async(code)


In [27]:
X_pre.head()

Unnamed: 0,f0,f1,f2,f3,f4,f5,f6,f7,f8,f9,...,rkd_f90,rkd_f91,rkd_f92,rkd_f93,rkd_f94,rkd_f95,rkd_f96,rkd_f97,rkd_f98,rkd_f99
0,0.503864,1.170148,0.497627,1.061322,0.497063,0.697261,1.178046,0.918892,0.89803,0.26461,...,18,22,22,6,44,52,83,37,64,88
1,0.532606,0.817177,0.414955,1.09994,0.523378,1.430099,0.707201,-1.001742,1.361183,0.395362,...,90,70,24,94,87,97,2,48,10,28
2,0.364383,0.777697,0.581712,0.902238,0.358373,1.046707,1.450572,1.213861,0.833888,0.448396,...,86,94,58,30,76,10,0,40,79,85
3,-0.552709,0.361766,0.800397,0.847488,0.20525,0.767239,0.975344,1.338971,1.352906,0.554931,...,6,70,11,7,49,35,10,37,39,41
4,-0.357576,1.199284,0.516476,0.895768,0.313432,0.890002,1.273778,-0.348193,1.110106,0.385873,...,20,40,4,86,79,32,20,19,13,58


In [65]:
cont_cols = X_pre.iloc[:,:100].columns
wide_cols = X_pre.iloc[:, 100:].columns

  and should_run_async(code)


In [66]:
cont_cols

Index(['f0', 'f1', 'f2', 'f3', 'f4', 'f5', 'f6', 'f7', 'f8', 'f9', 'f10',
       'f11', 'f12', 'f13', 'f14', 'f15', 'f16', 'f17', 'f18', 'f19', 'f20',
       'f21', 'f22', 'f23', 'f24', 'f25', 'f26', 'f27', 'f28', 'f29', 'f30',
       'f31', 'f32', 'f33', 'f34', 'f35', 'f36', 'f37', 'f38', 'f39', 'f40',
       'f41', 'f42', 'f43', 'f44', 'f45', 'f46', 'f47', 'f48', 'f49', 'f50',
       'f51', 'f52', 'f53', 'f54', 'f55', 'f56', 'f57', 'f58', 'f59', 'f60',
       'f61', 'f62', 'f63', 'f64', 'f65', 'f66', 'f67', 'f68', 'f69', 'f70',
       'f71', 'f72', 'f73', 'f74', 'f75', 'f76', 'f77', 'f78', 'f79', 'f80',
       'f81', 'f82', 'f83', 'f84', 'f85', 'f86', 'f87', 'f88', 'f89', 'f90',
       'f91', 'f92', 'f93', 'f94', 'f95', 'f96', 'f97', 'f98', 'f99'],
      dtype='object')

- So here, I've stored the column names in a list-like object (i.e. a pandas `Index`) 

In [67]:
wide_preprocessor = WidePreprocessor(wide_cols=wide_cols)
X_wide = wide_preprocessor.fit_transform(X_pre)

In [70]:
tab_preprocessor = TabPreprocessor(continuous_cols=cont_cols, scale=False, for_transformer=False,embed_cols=wide_cols)
X_tab = tab_preprocessor.fit_transform(X_pre)

In [71]:
X_wide.shape, X_tab.shape

((600000, 100), (600000, 200))

- So at this point, I've created two parallel versions of the dataset -- both contain all the instances, but the wide version contains only the categorical (binned) values, and the other contains both; the `embed_cols=` kwarg specifies that an embedding should be generated on the basis of them.

In [72]:
X_tab[10,:]

array([ 8.        , 11.        , 11.        , 11.        , 11.        ,
       11.        , 11.        , 10.        ,  9.        , 11.        ,
        9.        , 11.        ,  9.        , 10.        , 11.        ,
       11.        , 11.        , 11.        , 10.        ,  9.        ,
       10.        , 11.        , 11.        , 10.        ,  9.        ,
       10.        , 11.        , 11.        , 11.        ,  9.        ,
       11.        , 11.        , 11.        ,  4.        , 11.        ,
       11.        , 11.        , 10.        , 10.        , 11.        ,
       11.        , 11.        , 10.        , 10.        , 11.        ,
       10.        , 11.        , 11.        , 10.        , 10.        ,
        1.        , 11.        , 11.        , 11.        , 11.        ,
       11.        , 11.        , 11.        ,  1.        , 11.        ,
       10.        , 11.        , 11.        ,  9.        , 11.        ,
       11.        , 10.        , 11.        , 11.        , 11.  

- Note that the ones bound for embedding go first. 
- Not sure about the values they've generated -- it's possible that the WideDeep library's preprocessing is inappropriate post-binning

In [33]:
import torchinfo

  and should_run_async(code)


In [73]:
wide = Wide(wide_dim=np.unique(X_wide).shape[0], pred_dim=1)

In [74]:
torchinfo.summary(wide)

Layer (type:depth-idx)                   Param #
Wide                                     --
├─Embedding: 1-1                         10,001
Total params: 10,001
Trainable params: 10,001
Non-trainable params: 0

In [43]:
# X_pre.loc[:, X_bins.columns]

  and should_run_async(code)


In [62]:
# X_wide_train = np.array(X_wide_train)
# X_tab_train = np.array(X_tab_train)
# X_wide_valid = np.array(X_wide_valid)
# X_tab_valid = np.array(X_tab_valid)

In [75]:
# deeptabular = TabMlp(continuous_cols=X_gauss.columns, column_idx=tab_preprocessor.column_idx)
deeptabular = TabMlp(continuous_cols=cont_cols, column_idx=tab_preprocessor.column_idx)

In [76]:
model = WideDeep(wide=wide, deeptabular=deeptabular)

In [77]:
torchinfo.summary(model)

Layer (type:depth-idx)                        Param #
WideDeep                                      --
├─Wide: 1-1                                   --
│    └─Embedding: 2-1                         10,001
├─Sequential: 1-2                             --
│    └─TabMlp: 2-2                            --
│    │    └─CatEmbeddingsAndCont: 3-1         200
│    │    └─MLP: 3-2                          40,300
│    └─Linear: 2-3                            101
Total params: 50,602
Trainable params: 50,602
Non-trainable params: 0

In [86]:
# del X_deep_train, X_deep_valid
X_wide_train, X_wide_valid, y_train, y_valid = train_test_split(X_wide, y, test_size=0.2, random_state=42)
X_tab_train, X_tab_valid, _, _ = train_test_split(X_tab, y, test_size=0.2, random_state=42)

In [87]:
X_wide_train.shape, X_tab_train.shape

((480000, 100), (480000, 200))

In [90]:
# X_wide_train = pd.DataFrame(X_wide_train, columns=wide_cols)
# X_wide_valid = pd.DataFrame(X_wide_valid, columns=wide_cols)
type(y_train)
# type(X_wide_train)

pandas.core.series.Series

In [91]:
y_train = np.array(y_train)
y_valid = np.array(y_valid)

In [92]:
n_epochs = 25

wide_opt = AdamW(model.wide.parameters(), lr=0.1)
deep_opt = AdamW(model.deeptabular.parameters(), lr=0.1)

wide_sch = OneCycleLR(optimizer=wide_opt, max_lr=0.01, steps_per_epoch=X_wide_train.shape[0], epochs=n_epochs)
deep_sch = OneCycleLR(optimizer=deep_opt, max_lr=0.01, steps_per_epoch=X_tab_train.shape[0], epochs=n_epochs)

optimizers = {'wide': wide_opt, 'deeptabular': deep_opt }
lr_schedulers = {'wide': wide_sch, 'deeptabular': deep_sch }


callbacks = [
    LRHistory(n_epochs=n_epochs), 
]

# trainer
trainer = Trainer(model=model, 
                  objective='binary', 
                  metrics=[Accuracy], # with AUROC got TypeError: '>' not supported between instances of 'NoneType' and 'int' 
                  seed=42, 
                  optimizers=optimizers,
                  callbacks=callbacks
                 )

#             print(f"type(X_train_wide) is {type(X_train_wide)} and type(X_train_tab) is {type(X_train_tab)}")
trainer.fit( # this is where problem is beginning
    X_wide=X_wide_train,
    X_tab=X_tab_train,
    target=y_train,
    n_epochs=n_epochs,
    batch_size=1024, # default value is 32
#                 val_split=0.2, # no need for this
)

y_valid_preds = trainer.predict_proba(X_wide=X_wide_valid, X_tab=X_tab_valid, batch_size=1024)[:,1]
           
    

epoch 1: 100%|██████████| 469/469 [00:06<00:00, 73.32it/s, loss=2.12, metrics={'acc': 0.5511}]
epoch 2: 100%|██████████| 469/469 [00:05<00:00, 78.48it/s, loss=1.94, metrics={'acc': 0.5702}]
epoch 3: 100%|██████████| 469/469 [00:05<00:00, 80.71it/s, loss=1.81, metrics={'acc': 0.5761}]
epoch 4: 100%|██████████| 469/469 [00:05<00:00, 78.26it/s, loss=1.68, metrics={'acc': 0.5815}]
epoch 5: 100%|██████████| 469/469 [00:05<00:00, 82.15it/s, loss=1.57, metrics={'acc': 0.5862}]
epoch 6: 100%|██████████| 469/469 [00:05<00:00, 82.79it/s, loss=1.45, metrics={'acc': 0.5905}]
epoch 7: 100%|██████████| 469/469 [00:05<00:00, 78.27it/s, loss=1.35, metrics={'acc': 0.5948}]
epoch 8: 100%|██████████| 469/469 [00:05<00:00, 79.83it/s, loss=1.25, metrics={'acc': 0.598}] 
epoch 9: 100%|██████████| 469/469 [00:06<00:00, 77.58it/s, loss=1.16, metrics={'acc': 0.6026}]
epoch 10: 100%|██████████| 469/469 [00:05<00:00, 80.89it/s, loss=1.07, metrics={'acc': 0.6071}]
epoch 11: 100%|██████████| 469/469 [00:05<00:00, 

## LGBM Model and Noisy Run

In [16]:
wandb.init(
    project="202111_Kaggle_tabular_playground",
    save_code=True,
    tags=wandb_config['tags'],
    name=wandb_config['name'],
    notes=wandb_config['notes'],
    config=exmodel_config
) 

[34m[1mwandb[0m: Currently logged in as: [33mhushifang[0m (use `wandb login --relogin` to force relogin)
[34m[1mwandb[0m: wandb version 0.12.7 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


In [45]:
lgb_model = LGBMClassifier(
        objective='binary',
        random_state=42,
        #                     device_type='cpu',
        #                     n_jobs=-1,
        #                 eval_metric='auc',
        device_type='gpu',
        max_bin=63, # 15 might be even better for GPU perf, but depends on dataset -- see https://lightgbm.readthedocs.io/en/latest/GPU-Performance.html
        gpu_use_dp=False, # forces use of single precision rather than double for better perf, esp on consumer Nvidia chips
    )

In [46]:
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

In [47]:
scaler = RobustScaler()

In [48]:
scaler.fit_transform(X_train)

array([[-0.27296268,  0.01579322, -0.19031588, ...,  0.50363337,
         0.17720313, -0.58369832],
       [-0.18106382,  0.2410393 ,  5.30773161, ...,  0.24955035,
        -0.48450355, -0.62285755],
       [ 6.07300903, -0.85078822,  0.17280319, ..., -0.62764574,
        -0.67894852,  0.73726127],
       ...,
       [ 4.18856991, -0.68824246,  0.21225385, ...,  0.88726112,
         6.19955807, -0.09918299],
       [-0.300117  ,  0.13080398, -0.54802214, ..., -0.3979886 ,
        -0.01474   ,  0.22398866],
       [-0.29279129,  0.62179137,  0.07317624, ...,  0.14508993,
        12.038684  ,  0.6437292 ]])

In [49]:
scaler.transform(X_valid)

array([[ 1.27745959, -0.79095855,  3.57879036, ...,  0.40431588,
         0.30027418,  0.45467042],
       [-0.35032922, -0.36662758, -0.83368479, ...,  0.48734142,
        -0.59656634,  9.97028335],
       [ 2.07990909,  0.20997962, -0.0127639 , ..., -0.80240405,
        -0.22697866,  0.23615553],
       ...,
       [ 0.80633635,  0.39936366,  0.49845399, ...,  0.68680344,
        -1.04204429,  0.24502101],
       [-0.06863608, -0.27843164, -0.36283929, ...,  0.18702425,
         0.57010813, -0.05371796],
       [-0.3620719 , -0.30144763, -0.06982129, ..., -0.73907299,
        -1.0792864 ,  0.24365673]])

In [50]:
lgb_model.fit(X_train, y_train)

LGBMClassifier(device_type='gpu', gpu_use_dp=False, max_bin=63,
               objective='binary', random_state=42)

In [51]:
preds = lgb_model.predict_proba(X_valid)[:,1]
valid_auc = roc_auc_score(y_true=y_valid, y_score=preds)
valid_auc

0.7322242977619932

In [52]:
dump(preds, predpath/'noisy_lgbm_20211122.joblib')

['/media/sf/easystore/kaggle_data/tabular_playgrounds/nov2021/preds/noisy_lgbm_20211122.joblib']

In [30]:
wandb.log({'overall_valid_auc': valid_auc,
           'model_params': str(lgb_model.get_params()),
           'model_seed': 42,
          })
wandb.finish()


VBox(children=(Label(value=' 0.06MB of 0.06MB uploaded (0.02MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
model_seed,▁
overall_valid_auc,▁

0,1
model_params,{'boosting_type': 'g...
model_seed,42
overall_valid_auc,0.73222


## Clean Run

In [31]:
exmodel_config['denoising'] = 'cleanlab'

wandb_config = {
    # wandb config
    'name': f"{os.environ['WANDB_NOTEBOOK_NAME'][:-6]}_{datetime.now().strftime('%H%M%S')}", # just removes the .ipynb extension, leaving the notebook filename's stem
    'project': '202111_Kaggle_tabular_playground',
    'tags': ['experiment'],
    'notes': "Going to try CleanLab with the 'best' params from the previous sweep, but wanting to see what the actual LB score is, and how it compares to the noisy one. Using a default LGBMClassfier with holdout on robust-scaled original dataset.",
    'config': exmodel_config,
}

wandb.init(
    project="202111_Kaggle_tabular_playground",
    save_code=True,
    tags=wandb_config['tags'],
    name=wandb_config['name'],
    notes=wandb_config['notes'],
    config=exmodel_config
) 

[34m[1mwandb[0m: wandb version 0.12.7 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


In [32]:
rp_params = {
    # 'prune_method': trial.suggest_categorical('prune_method', ['prune_by_noise_rate', 'prune_by_class', 'both']),
    # 'converge_latent_estimates': trial.suggest_categorical('converge_latent_estimates', [True, False]),
    # 'pulearning': trial.suggest_categorical('pulearning', [0,1,None])
    'prune_method': 'both',
    'converge_latent_estimates': True,
    'pulearning': 1,
}

In [33]:
lgb_model = LGBMClassifier(
        objective='binary',
        random_state=42,
        #                     device_type='cpu',
        #                     n_jobs=-1,
        #                 eval_metric='auc',
        device_type='gpu',
        max_bin=63, # 15 might be even better for GPU perf, but depends on dataset -- see https://lightgbm.readthedocs.io/en/latest/GPU-Performance.html
        gpu_use_dp=False, # forces use of single precision rather than double for better perf, esp on consumer Nvidia chips
    )

In [34]:
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

In [35]:
scaler = RobustScaler()

In [36]:
scaler.fit_transform(X_train)

array([[-0.27296268,  0.01579322, -0.19031588, ...,  0.50363337,
         0.17720313, -0.58369832],
       [-0.18106382,  0.2410393 ,  5.30773161, ...,  0.24955035,
        -0.48450355, -0.62285755],
       [ 6.07300903, -0.85078822,  0.17280319, ..., -0.62764574,
        -0.67894852,  0.73726127],
       ...,
       [ 4.18856991, -0.68824246,  0.21225385, ...,  0.88726112,
         6.19955807, -0.09918299],
       [-0.300117  ,  0.13080398, -0.54802214, ..., -0.3979886 ,
        -0.01474   ,  0.22398866],
       [-0.29279129,  0.62179137,  0.07317624, ...,  0.14508993,
        12.038684  ,  0.6437292 ]])

In [37]:
scaler.transform(X_valid)

array([[ 1.27745959, -0.79095855,  3.57879036, ...,  0.40431588,
         0.30027418,  0.45467042],
       [-0.35032922, -0.36662758, -0.83368479, ...,  0.48734142,
        -0.59656634,  9.97028335],
       [ 2.07990909,  0.20997962, -0.0127639 , ..., -0.80240405,
        -0.22697866,  0.23615553],
       ...,
       [ 0.80633635,  0.39936366,  0.49845399, ...,  0.68680344,
        -1.04204429,  0.24502101],
       [-0.06863608, -0.27843164, -0.36283929, ...,  0.18702425,
         0.57010813, -0.05371796],
       [-0.3620719 , -0.30144763, -0.06982129, ..., -0.73907299,
        -1.0792864 ,  0.24365673]])

In [39]:
X_train = np.array(X_train)
X_valid = np.array(X_valid)
y_train = np.array(y_train)
y_valid = np.array(y_valid)

In [40]:
rp = LearningWithNoisyLabels(clf=lgb_model, **rp_params)
rp.fit(X_train, y_train)

LGBMClassifier(device_type='gpu', gpu_use_dp=False, max_bin=63,
               objective='binary', random_state=42)

In [41]:
preds = rp.predict_proba(X_valid)[:,1]
valid_auc = roc_auc_score(y_true=y_valid, y_score=preds)
valid_auc

0.7242336321944994

In [42]:
rp.get_params()

{'clf__boosting_type': 'gbdt',
 'clf__class_weight': None,
 'clf__colsample_bytree': 1.0,
 'clf__importance_type': 'split',
 'clf__learning_rate': 0.1,
 'clf__max_depth': -1,
 'clf__min_child_samples': 20,
 'clf__min_child_weight': 0.001,
 'clf__min_split_gain': 0.0,
 'clf__n_estimators': 100,
 'clf__n_jobs': -1,
 'clf__num_leaves': 31,
 'clf__objective': 'binary',
 'clf__random_state': 42,
 'clf__reg_alpha': 0.0,
 'clf__reg_lambda': 0.0,
 'clf__silent': 'warn',
 'clf__subsample': 1.0,
 'clf__subsample_for_bin': 200000,
 'clf__subsample_freq': 0,
 'clf__device_type': 'gpu',
 'clf__max_bin': 63,
 'clf__gpu_use_dp': False,
 'clf': LGBMClassifier(device_type='gpu', gpu_use_dp=False, max_bin=63,
                objective='binary', random_state=42),
 'converge_latent_estimates': True,
 'cv_n_folds': 5,
 'n_jobs': 16,
 'prune_method': 'both',
 'pulearning': 1,
 'seed': None}

In [43]:
wandb.log({'overall_valid_auc': valid_auc,
           'model_params': str(lgb_model.get_params()),
           'model_seed': 42,
           'cleanlab_params': str(rp.get_params())
          })
wandb.finish()


VBox(children=(Label(value=' 0.09MB of 0.09MB uploaded (0.04MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
model_seed,▁
overall_valid_auc,▁

0,1
cleanlab_params,{'clf__boosting_type...
model_params,{'boosting_type': 'g...
model_seed,42
overall_valid_auc,0.72423


In [44]:
dump(preds, predpath/'cleanlab_lgbm_20211122.joblib')

['/media/sf/easystore/kaggle_data/tabular_playgrounds/nov2021/preds/cleanlab_lgbm_20211122.joblib']

## Submission

In [59]:
X_test = pd.read_feather(dataset_params['test_source'])

In [60]:
scaler = RobustScaler()

In [61]:
scaler.fit(X)

RobustScaler()

In [62]:
scaler.transform(X_test)

array([[-2.54904070e-01,  8.92660373e-01,  1.74689787e+00, ...,
        -3.14632989e-01,  7.56057179e-01, -5.35039504e-01],
       [-2.40419666e-01, -7.73033673e-01, -9.03498547e-01, ...,
         4.04404276e-01, -1.09484041e-01,  7.09297297e-01],
       [ 3.67480353e+00, -3.04606091e-02, -9.53230106e-01, ...,
         2.60323086e-01,  5.22578209e-01,  7.16351788e-02],
       ...,
       [ 7.74437122e-01,  4.76283375e-03, -7.63896086e-01, ...,
        -1.52318988e+00,  7.39754091e+00,  7.52089854e-01],
       [ 4.71329335e+00,  3.45526482e-01, -4.12767694e-01, ...,
        -9.01228907e-01, -3.59849849e-02,  2.78337176e-01],
       [ 1.01900951e+00,  5.92913219e-02,  9.90131581e-01, ...,
        -9.60166650e-01,  1.45144893e-01,  5.01730920e-01]])

In [63]:
dirty_preds = rp.predict_proba(X_test)[:,1]
clean_preds = lgb_model.predict_proba(X_test)[:,1]

In [64]:
sample_df = pd.read_csv(datapath/'sample_submission.csv')

In [65]:
sample_df.head()

Unnamed: 0,id,target
0,600000,0.5
1,600001,0.5
2,600002,0.5
3,600003,0.5
4,600004,0.5


In [67]:
dirty_preds.shape

(540000,)

In [69]:
clean_preds.shape

(540000,)

In [68]:
sample_df.shape

(540000, 2)

In [70]:
sample_df.loc[:, 'target'] = dirty_preds

In [71]:
sample_df.head()

Unnamed: 0,id,target
0,600000,0.866322
1,600001,0.948761
2,600002,0.976896
3,600003,0.810975
4,600004,0.665766


In [None]:
# submission_path = datapath/'submissions'
# submission_path.mkdir(exist_ok=True)

In [72]:
sample_df.to_csv(subpath/f"{wandb_config['name']}_basic_LGBM_preds.csv", index=False)
# sample_df.to_csv(subpath/f"{wandb_config['name']}_3level-X_orig+KMeans8+synth-GBM-stack_ensemble_preds.csv", index=False)

In [73]:
sample_df.loc[:, 'target'] = clean_preds

In [74]:
sample_df.head()

Unnamed: 0,id,target
0,600000,0.647848
1,600001,0.632269
2,600002,0.797033
3,600003,0.556056
4,600004,0.410237


In [75]:
sample_df.to_csv(subpath/f"{wandb_config['name']}_cleanlab_basic_LGBM_preds.csv", index=False)


# CONCLUSION
Actually, Cleanlab works: LB score with cleaning is 0.72402 compared to 0.71712 with the dirty one.