In [1]:
# notebook configuration
COLAB = False # when True, the path-setup cell mounts Google Drive and reads the data from there
USE_GPU = True # intended switch for GPU training

In [2]:
# basic imports
from pathlib import Path
import os
import math
from datetime import datetime
import random

In [3]:
# render matplotlib figures inline in the notebook
%matplotlib inline
# turn off jedi-based tab completion in IPython
%config Completer.use_jedi = False
# dated notebook name exposed to W&B; the run names in the config cells are built from this value
os.environ['WANDB_NOTEBOOK_NAME'] = f"cleanlab_{datetime.now().strftime('%Y%m%d')}.ipynb"

In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm

# general ML tooling
from sklearn.model_selection import train_test_split, StratifiedKFold, KFold
from sklearn.metrics import log_loss, roc_auc_score
import wandb
from wandb.xgboost import wandb_callback
from wandb.lightgbm import wandb_callback
from sklearn.impute import SimpleImputer #, KNNImputer

import seaborn as sns

from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
# from sklearn.ensemble import StackingClassifier, RandomForestClassifier
from sklearn.preprocessing import RobustScaler #StandardScaler #, MinMaxScaler, MaxAbsScaler, RobustScaler, PolynomialFeatures
from sklearn.linear_model import LogisticRegression
# from sklearn.feature_selection import SelectKBest, f_regression
from joblib import dump, load
# feature engineering tools
# from sklearn.feature_selection import mutual_info_regression
# import featuretools as ft

# from BorutaShap import BorutaShap
import optuna
from optuna.integration.wandb import WeightsAndBiasesCallback
from optuna.samplers import TPESampler
from sklearn.utils import resample

In [5]:
from cleanlab.classification import LearningWithNoisyLabels

In [6]:
# from pytorch_widedeep import Trainer
# from pytorch_widedeep.preprocessing import WidePreprocessor, TabPreprocessor
# from pytorch_widedeep.models import Wide, TabMlp, WideDeep#, SAINT, TabTransformer, TabNet, TabFastFormer, TabResnet
# from pytorch_widedeep.metrics import Accuracy
# from torchmetrics import AUROC
# import torch
# from torch.optim import Adam, AdamW, Adagrad, SGD, RMSprop, LBFGS
# from torch.optim.lr_scheduler import ReduceLROnPlateau, CosineAnnealingWarmRestarts, CyclicLR, OneCycleLR, StepLR, CosineAnnealingLR
# from pytorch_widedeep.callbacks import EarlyStopping, LRHistory, ModelCheckpoint

In [7]:
# import category_encoders as ce

In [8]:
if COLAB:
    # running on Colab: the data lives on the mounted Google Drive
    from google.colab import drive
    drive.mount('/content/drive')

    datapath = Path('/content/drive/MyDrive/kaggle/tabular_playgrounds/nov2021/')
else:
    # running locally: every working directory hangs off one competition root
    root = Path('/media/sf/easystore/kaggle_data/tabular_playgrounds/nov2021/')
    datapath, predpath, subpath = (root / leaf for leaf in ('datasets', 'preds', 'submissions'))

    # create each working directory if it is missing (the root itself must already exist,
    # because mkdir is called without parents=True)
    for pth in (datapath, predpath, subpath):
        pth.mkdir(exist_ok=True)
    

In [9]:
SEED = 42

# Function to seed everything
def seed_everything(seed):
    """Seed every source of randomness this notebook controls directly.

    Writes ``seed`` to the ``PYTHONHASHSEED`` environment variable and seeds
    both Python's ``random`` module and NumPy's legacy global generator.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)

seed_everything(seed=SEED)

In [10]:
def reduce_memory_usage(df, verbose=True):
    """Downcast numeric columns of ``df`` to the narrowest dtype that holds their range.

    Integer columns are cast to the smallest of int8/int16/int32/int64 whose
    limits contain the column's min and max. The bounds are inclusive, so a
    column spanning exactly -128..127 becomes int8. Float columns are cast to
    float16, then float32, and otherwise float64, using the same range test.
    Columns whose dtype is not one of the signed-int/float dtypes listed in
    ``numerics`` are left untouched.

    The float test looks only at range, not precision: float16 keeps roughly
    three significant decimal digits, so values may be rounded by the cast.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place.
    verbose : bool, default True
        Print the final memory footprint and the percentage saved.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after downcasting.
    """
    numerics = ["int8", "int16", "int32", "int64", "float16", "float32", "float64"]
    # candidates are ordered narrowest first; the first one that fits wins
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024 ** 2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == "int":
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                # inclusive: a value equal to the dtype's limit is still representable
                if c_min >= limits.min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if c_min >= limits.min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # out of float32 range, or min/max is NaN/inf: keep full precision
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024 ** 2
    if verbose:
        # avoid a NaN percentage when the frame occupied no memory to begin with
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print(
            "Mem. usage decreased to {:.2f} Mb ({:.1f}% reduction)".format(
                end_mem, reduction
            )
        )
    return df

In [11]:
# dataset_params records where the data came from; as written it holds only file paths (as strings),
# with the precomputed-artifact alternatives kept below as commented-out entries
dataset_params = {
    # 'train_source': str(datapath/'X-RobustScaled-pca_mle-umap_embedding_20211107-n_comp10-n_neighbors15-rs42.joblib'),
    'train_source': str(datapath/'X_orig.feather'),
    'target_source': str(datapath/'y_orig.joblib'),
    # 'test_source': str(datapath/'X_test-RobustScaled-pca_mle-umap_embedding_20211107-n_comp10-n_neighbors15-rs42.joblib'),
    'test_source': str(datapath/'X_test_orig-no_scaling.feather'),
    # 'scaler': str(RobustScaler()),
    # 'pca': str(load(datapath/'pca_mle-RobustScaled_orig_trainset.joblib')),
    # 'umap': str(load(datapath/'umap_reducer-20211107-n_comp10-n_neighbors15-rs42-pca_mle-RobustScaled_orig_trainset.joblib')),
}   

# referring back to the already-entered attributes, specify how the pipeline was sequenced
# dataset_params['preprocessing_pipeline'] = str([dataset_params['scaler'], dataset_params['pca'], dataset_params['umap']]) # ACTUALLY this is unwieldy
# dataset_params['preprocessing_pipeline'] = '[scaler, pca, umap]' # more fragile, but also more readable

# now, load the datasets and generate more metadata from them
# X = load(dataset_params['train_source'])
X = pd.read_feather(dataset_params['train_source'])
# joblib.load unpickles the file, so only point this at artifacts you produced yourself
y = load(dataset_params['target_source'])
# the test set is not loaded here: both loaders below are commented out
# X_test = load(dataset_params['test_source'])
# X_test = pd.read_feather(dataset_params['test_source'])

# record the training frame's shape alongside the source paths
dataset_params['feature_count'] = X.shape[1]
dataset_params['instance_count'] = X.shape[0]
    

In [12]:
# NOTE(review): `scaler` is rebound to a new RobustScaler further down before any fit call,
# so this particular instance is never fitted in the visible cells
scaler = RobustScaler()

In [13]:
# Experiment-level settings that are NOT model hyperparameters: architecture tag,
# seed, validation scheme and scaler, followed by the dataset metadata.
# (In the sweep version this also carries defaults for the model parameters.)
exmodel_config = dict(
    arch='lightgbm',
    level=1,
    random_state=SEED,
    cross_val_strategy=None,  # None for holdout, or the relevant sklearn class
    kfolds=1,  # 1 means plain holdout
    test_size=0.2,
    scaler=str(RobustScaler()),
)
exmodel_config.update(dataset_params)

# Settings for the W&B run. The run name is the notebook filename's stem
# (the trailing '.ipynb' sliced off) plus the current time of day.
wandb_kwargs = dict(
    name=os.environ['WANDB_NOTEBOOK_NAME'][:-6] + '_' + datetime.now().strftime('%H%M%S'),
    project='202111_Kaggle_tabular_playground',
    tags=['experiment'],
    notes="Going to try CleanLab with the 'best' params from the previous sweep, but wanting to see what the actual LB score is, and how it compares to the noisy one. Using a default LGBMClassfier with holdout on robust-scaled original dataset.",
    config=exmodel_config,
)

In [14]:
# Start the W&B run from the settings dict built in the previous cell (`wandb_kwargs`).
wandb.init(
    project=wandb_kwargs['project'],
    save_code=True,
    tags=wandb_kwargs['tags'],
    name=wandb_kwargs['name'],
    notes=wandb_kwargs['notes'],
    config=wandb_kwargs['config'],
)

In [15]:
# Experiment-level settings that are NOT model hyperparameters: architecture tag,
# seed, validation scheme and scaler, followed by the dataset metadata.
# (In the sweep version this also carries defaults for the model parameters.)
exmodel_config = dict(
    arch='lightgbm',
    level=1,
    random_state=SEED,
    cross_val_strategy=None,  # None for holdout, or the relevant sklearn class
    kfolds=1,  # 1 means plain holdout
    test_size=0.2,
    scaler=str(RobustScaler()),
)
exmodel_config.update(dataset_params)

# Settings for the W&B run. The run name is the notebook filename's stem
# (the trailing '.ipynb' sliced off) plus the current time of day.
wandb_config = dict(
    name=os.environ['WANDB_NOTEBOOK_NAME'][:-6] + '_' + datetime.now().strftime('%H%M%S'),
    project='202111_Kaggle_tabular_playground',
    tags=['experiment'],
    notes="Going to try CleanLab with the 'best' params from the previous sweep, but wanting to see what the actual LB score is, and how it compares to the noisy one. Using a default LGBMClassfier with holdout on robust-scaled original dataset.",
    config=exmodel_config,
)

In [16]:
# Open the W&B run for the baseline (no denoising) experiment.
wandb.init(
    project=wandb_config['project'],
    name=wandb_config['name'],
    tags=wandb_config['tags'],
    notes=wandb_config['notes'],
    config=wandb_config['config'],
    save_code=True,
)

<wandb.sdk.wandb_run.Run at 0x7f735419d760>

In [17]:
# GPU-specific settings are applied only when the notebook-level USE_GPU flag is set;
# otherwise LightGBM's defaults are used.
lgb_gpu_params = {
    'device_type': 'gpu',
    # 15 might be even better for GPU perf, but depends on dataset -- see https://lightgbm.readthedocs.io/en/latest/GPU-Performance.html
    'max_bin': 63,
    # single precision rather than double for better perf, esp on consumer Nvidia chips
    'gpu_use_dp': False,
} if USE_GPU else {}

lgb_model = LGBMClassifier(
    objective='binary',
    random_state=SEED,  # the same seed that is recorded in exmodel_config
    **lgb_gpu_params,
)

In [18]:
# Holdout split driven by the logged config, so the test_size and seed recorded
# for the run are the ones actually used.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y,
    test_size=exmodel_config['test_size'],
    random_state=exmodel_config['random_state'],
)

In [19]:
# fresh, unfitted scaler for the split above
scaler = RobustScaler()

In [20]:
# Fits the scaler on the training split and displays the scaled array.
# The result is not assigned, so X_train keeps referring to the unscaled split.
# (y_train is accepted by the sklearn API here but RobustScaler does not use it.)
scaler.fit_transform(X_train,y_train)

array([[-0.27296268,  0.01579322, -0.19031588, ...,  0.50363337,
         0.17720313, -0.58369832],
       [-0.18106382,  0.2410393 ,  5.30773161, ...,  0.24955035,
        -0.48450355, -0.62285755],
       [ 6.07300903, -0.85078822,  0.17280319, ..., -0.62764574,
        -0.67894852,  0.73726127],
       ...,
       [ 4.18856991, -0.68824246,  0.21225385, ...,  0.88726112,
         6.19955807, -0.09918299],
       [-0.300117  ,  0.13080398, -0.54802214, ..., -0.3979886 ,
        -0.01474   ,  0.22398866],
       [-0.29279129,  0.62179137,  0.07317624, ...,  0.14508993,
        12.038684  ,  0.6437292 ]])

In [21]:
# RobustScaler.transform takes only X; passing y_valid as a second positional
# argument raises TypeError. The result is displayed, not stored.
scaler.transform(X_valid)

In [22]:
# Preview the validation split under the train-fitted scaler; the result is displayed, not stored.
scaler.transform(X_valid)

array([[ 1.27745959, -0.79095855,  3.57879036, ...,  0.40431588,
         0.30027418,  0.45467042],
       [-0.35032922, -0.36662758, -0.83368479, ...,  0.48734142,
        -0.59656634,  9.97028335],
       [ 2.07990909,  0.20997962, -0.0127639 , ..., -0.80240405,
        -0.22697866,  0.23615553],
       ...,
       [ 0.80633635,  0.39936366,  0.49845399, ...,  0.68680344,
        -1.04204429,  0.24502101],
       [-0.06863608, -0.27843164, -0.36283929, ...,  0.18702425,
         0.57010813, -0.05371796],
       [-0.3620719 , -0.30144763, -0.06982129, ..., -0.73907299,
        -1.0792864 ,  0.24365673]])

In [23]:
# Holdout split driven by the logged config, so the test_size and seed recorded
# for the run are the ones actually used.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y,
    test_size=exmodel_config['test_size'],
    random_state=exmodel_config['random_state'],
)

In [24]:
# fresh, unfitted scaler for the split above
scaler = RobustScaler()

In [25]:
# Fits the scaler on the training split and displays the scaled array.
# The result is not assigned, so X_train keeps referring to the unscaled split.
scaler.fit_transform(X_train)

array([[-0.27296268,  0.01579322, -0.19031588, ...,  0.50363337,
         0.17720313, -0.58369832],
       [-0.18106382,  0.2410393 ,  5.30773161, ...,  0.24955035,
        -0.48450355, -0.62285755],
       [ 6.07300903, -0.85078822,  0.17280319, ..., -0.62764574,
        -0.67894852,  0.73726127],
       ...,
       [ 4.18856991, -0.68824246,  0.21225385, ...,  0.88726112,
         6.19955807, -0.09918299],
       [-0.300117  ,  0.13080398, -0.54802214, ..., -0.3979886 ,
        -0.01474   ,  0.22398866],
       [-0.29279129,  0.62179137,  0.07317624, ...,  0.14508993,
        12.038684  ,  0.6437292 ]])

In [26]:
# Preview the validation split under the train-fitted scaler; the result is displayed, not stored.
scaler.transform(X_valid)

array([[ 1.27745959, -0.79095855,  3.57879036, ...,  0.40431588,
         0.30027418,  0.45467042],
       [-0.35032922, -0.36662758, -0.83368479, ...,  0.48734142,
        -0.59656634,  9.97028335],
       [ 2.07990909,  0.20997962, -0.0127639 , ..., -0.80240405,
        -0.22697866,  0.23615553],
       ...,
       [ 0.80633635,  0.39936366,  0.49845399, ...,  0.68680344,
        -1.04204429,  0.24502101],
       [-0.06863608, -0.27843164, -0.36283929, ...,  0.18702425,
         0.57010813, -0.05371796],
       [-0.3620719 , -0.30144763, -0.06982129, ..., -0.73907299,
        -1.0792864 ,  0.24365673]])

In [27]:
# NOTE(review): X_train here is still the split returned by train_test_split. The scaled
# arrays in the preceding cells were displayed but never assigned, so the model is fitted
# on unscaled features even though the run config records a RobustScaler.
lgb_model.fit(X_train, y_train)

LGBMClassifier(device_type='gpu', gpu_use_dp=False, max_bin=63,
               objective='binary', random_state=42)

In [28]:
# Positive-class probabilities on the holdout split, scored with ROC AUC.
preds = lgb_model.predict_proba(X_valid)[:, 1]
valid_auc = roc_auc_score(y_valid, preds)
valid_auc

0.7322242958171294

In [29]:
# Log the holdout score and the fitted classifier's hyperparameters to the active W&B run.
# The run is deliberately left open here: the following cell logs to it again and then
# calls wandb.finish(), which would fail if the run were already closed.
wandb.log({'overall_valid_auc': valid_auc,
           'model_params': str(lgb_model.get_params()),
           'model_seed': 42,
          })

In [30]:
# Record the baseline result on the active W&B run, then close the run.
wandb.log(dict(
    overall_valid_auc=valid_auc,
    model_params=str(lgb_model.get_params()),
    model_seed=42,
))
wandb.finish()

In [31]:
# Second experiment: same setup, now flagged as using cleanlab denoising.
exmodel_config.update(denoising='cleanlab')

# Fresh run settings; the name is rebuilt so it carries the current time of day
# after the notebook filename's stem (the trailing '.ipynb' sliced off).
wandb_config = dict(
    name=os.environ['WANDB_NOTEBOOK_NAME'][:-6] + '_' + datetime.now().strftime('%H%M%S'),
    project='202111_Kaggle_tabular_playground',
    tags=['experiment'],
    notes="Going to try CleanLab with the 'best' params from the previous sweep, but wanting to see what the actual LB score is, and how it compares to the noisy one. Using a default LGBMClassfier with holdout on robust-scaled original dataset.",
    config=exmodel_config,
)

wandb.init(
    project=wandb_config['project'],
    name=wandb_config['name'],
    tags=wandb_config['tags'],
    notes=wandb_config['notes'],
    config=wandb_config['config'],
    save_code=True,
)

<wandb.sdk.wandb_run.Run at 0x7f73450cf130>

In [32]:
# Fixed cleanlab settings for this run. The commented lines show the choices
# that the sweep version drew through trial.suggest_categorical.
rp_params = dict(
    # swept over: ['prune_by_noise_rate', 'prune_by_class', 'both']
    prune_method='both',
    # swept over: [True, False]
    converge_latent_estimates=True,
    # swept over: [0, 1, None]
    pulearning=1,
)

In [33]:
# GPU-specific settings are applied only when the notebook-level USE_GPU flag is set;
# otherwise LightGBM's defaults are used.
lgb_gpu_params = {
    'device_type': 'gpu',
    # 15 might be even better for GPU perf, but depends on dataset -- see https://lightgbm.readthedocs.io/en/latest/GPU-Performance.html
    'max_bin': 63,
    # single precision rather than double for better perf, esp on consumer Nvidia chips
    'gpu_use_dp': False,
} if USE_GPU else {}

lgb_model = LGBMClassifier(
    objective='binary',
    random_state=SEED,  # the same seed that is recorded in exmodel_config
    **lgb_gpu_params,
)

In [34]:
# Holdout split driven by the logged config, so the test_size and seed recorded
# for the run are the ones actually used.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y,
    test_size=exmodel_config['test_size'],
    random_state=exmodel_config['random_state'],
)

In [35]:
# fresh, unfitted scaler for the split above
scaler = RobustScaler()

In [36]:
# Fits the scaler on the training split and displays the scaled array.
# The result is not assigned, so X_train keeps referring to the unscaled split.
scaler.fit_transform(X_train)

array([[-0.27296268,  0.01579322, -0.19031588, ...,  0.50363337,
         0.17720313, -0.58369832],
       [-0.18106382,  0.2410393 ,  5.30773161, ...,  0.24955035,
        -0.48450355, -0.62285755],
       [ 6.07300903, -0.85078822,  0.17280319, ..., -0.62764574,
        -0.67894852,  0.73726127],
       ...,
       [ 4.18856991, -0.68824246,  0.21225385, ...,  0.88726112,
         6.19955807, -0.09918299],
       [-0.300117  ,  0.13080398, -0.54802214, ..., -0.3979886 ,
        -0.01474   ,  0.22398866],
       [-0.29279129,  0.62179137,  0.07317624, ...,  0.14508993,
        12.038684  ,  0.6437292 ]])

In [37]:
# Preview the validation split under the train-fitted scaler; the result is displayed, not stored.
scaler.transform(X_valid)

array([[ 1.27745959, -0.79095855,  3.57879036, ...,  0.40431588,
         0.30027418,  0.45467042],
       [-0.35032922, -0.36662758, -0.83368479, ...,  0.48734142,
        -0.59656634,  9.97028335],
       [ 2.07990909,  0.20997962, -0.0127639 , ..., -0.80240405,
        -0.22697866,  0.23615553],
       ...,
       [ 0.80633635,  0.39936366,  0.49845399, ...,  0.68680344,
        -1.04204429,  0.24502101],
       [-0.06863608, -0.27843164, -0.36283929, ...,  0.18702425,
         0.57010813, -0.05371796],
       [-0.3620719 , -0.30144763, -0.06982129, ..., -0.73907299,
        -1.0792864 ,  0.24365673]])

In [38]:
# Wrap the LightGBM classifier in cleanlab's noisy-label learner.
rp = LearningWithNoisyLabels(clf=lgb_model, **rp_params)
# Hand over plain NumPy arrays: at this point the split still holds the pandas objects
# returned by train_test_split, and the fit was only successful once the data had been
# converted to arrays.
rp.fit(np.asarray(X_train), np.asarray(y_train))

In [39]:
# Replace each piece of the holdout split with a NumPy array copy of itself.
X_train, X_valid, y_train, y_valid = (
    np.array(part) for part in (X_train, X_valid, y_train, y_valid)
)

In [40]:
# Build the cleanlab wrapper around the LightGBM classifier and fit it on the
# NumPy arrays produced by the conversion cell above.
# NOTE(review): these arrays come from the raw split; no scaled data was ever assigned.
rp = LearningWithNoisyLabels(clf=lgb_model, **rp_params)
rp.fit(X_train, y_train)

LGBMClassifier(device_type='gpu', gpu_use_dp=False, max_bin=63,
               objective='binary', random_state=42)

In [41]:
# Positive-class probabilities from the cleanlab-wrapped model on the same holdout split,
# scored with ROC AUC.
preds = rp.predict_proba(X_valid)[:, 1]
valid_auc = roc_auc_score(y_valid, preds)
valid_auc

0.7242336321944994

In [42]:
# Show the wrapper's full parameter set: the wrapped classifier's settings (clf__*)
# plus cleanlab's own options.
rp.get_params()

{'clf__boosting_type': 'gbdt',
 'clf__class_weight': None,
 'clf__colsample_bytree': 1.0,
 'clf__importance_type': 'split',
 'clf__learning_rate': 0.1,
 'clf__max_depth': -1,
 'clf__min_child_samples': 20,
 'clf__min_child_weight': 0.001,
 'clf__min_split_gain': 0.0,
 'clf__n_estimators': 100,
 'clf__n_jobs': -1,
 'clf__num_leaves': 31,
 'clf__objective': 'binary',
 'clf__random_state': 42,
 'clf__reg_alpha': 0.0,
 'clf__reg_lambda': 0.0,
 'clf__silent': 'warn',
 'clf__subsample': 1.0,
 'clf__subsample_for_bin': 200000,
 'clf__subsample_freq': 0,
 'clf__device_type': 'gpu',
 'clf__max_bin': 63,
 'clf__gpu_use_dp': False,
 'clf': LGBMClassifier(device_type='gpu', gpu_use_dp=False, max_bin=63,
                objective='binary', random_state=42),
 'converge_latent_estimates': True,
 'cv_n_folds': 5,
 'n_jobs': 16,
 'prune_method': 'both',
 'pulearning': 1,
 'seed': None}

In [43]:
# Record the cleanlab result on the active W&B run, including the wrapper's own
# parameters next to the classifier's, then close the run.
wandb.log(dict(
    overall_valid_auc=valid_auc,
    model_params=str(lgb_model.get_params()),
    model_seed=42,
    cleanlab_params=str(rp.get_params()),
))
wandb.finish()