Header for notebooks -- customize as required.

In [1]:
# Notebook-wide switches; edit these before running the rest of the cells.
# USE_GPU: flag for GPU training.
USE_GPU = True
# COLAB: True when running on Google Colab (selects the Drive-based data paths below;
# per the original note it is also meant to trigger manual installation of packages).
COLAB = False

In [2]:
# basic imports
from pathlib import Path
import os
import math
from datetime import datetime
import random

In [3]:
# render matplotlib figures inline in the cell output
%matplotlib inline
# switch off jedi-based tab completion in IPython
%config Completer.use_jedi = False
# notebook name exposed to W&B through the environment, stamped with today's date (YYYYMMDD);
# the same value is read back further down to build the W&B run name
os.environ['WANDB_NOTEBOOK_NAME'] = f"sweep_cleanlab_{datetime.now().strftime('%Y%m%d')}.ipynb"

Now, non-stdlib imports

In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm

# general ML tooling
from sklearn.model_selection import train_test_split, StratifiedKFold, KFold
from sklearn.metrics import log_loss, roc_auc_score
import wandb
from wandb.xgboost import wandb_callback
from wandb.lightgbm import wandb_callback
from sklearn.impute import SimpleImputer #, KNNImputer

import seaborn as sns

from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
# from sklearn.ensemble import StackingClassifier, RandomForestClassifier
from sklearn.preprocessing import RobustScaler #StandardScaler #, MinMaxScaler, MaxAbsScaler, RobustScaler, PolynomialFeatures
from sklearn.linear_model import LogisticRegression
# from sklearn.feature_selection import SelectKBest, f_regression
from joblib import dump, load
# feature engineering tools
# from sklearn.feature_selection import mutual_info_regression
# import featuretools as ft

# from BorutaShap import BorutaShap
import optuna
from optuna.integration.wandb import WeightsAndBiasesCallback
from optuna.samplers import TPESampler
from sklearn.utils import resample

In [5]:
from cleanlab.classification import LearningWithNoisyLabels

In [6]:
# from pytorch_widedeep import Trainer
# from pytorch_widedeep.preprocessing import WidePreprocessor, TabPreprocessor
# from pytorch_widedeep.models import Wide, TabMlp, WideDeep#, SAINT, TabTransformer, TabNet, TabFastFormer, TabResnet
# from pytorch_widedeep.metrics import Accuracy
# from torchmetrics import AUROC
# import torch
# from torch.optim import Adam, AdamW, Adagrad, SGD, RMSprop, LBFGS
# from torch.optim.lr_scheduler import ReduceLROnPlateau, CosineAnnealingWarmRestarts, CyclicLR, OneCycleLR, StepLR, CosineAnnealingLR
# from pytorch_widedeep.callbacks import EarlyStopping, LRHistory, ModelCheckpoint

In [7]:
# import category_encoders as ce

Now, datapath setup

In [8]:
# Resolve the project root for the current environment, then derive every working
# directory from it so that `root`, `datapath`, `predpath` and `subpath` exist
# regardless of which branch ran.
if COLAB:
    # mount Google Drive
    from google.colab import drive
    drive.mount('/content/drive')

    root = Path('/content/drive/MyDrive/kaggle/tabular_playgrounds/nov2021/')
    # on Colab the data path is the project folder itself (unchanged from the original setup)
    datapath = root
else:
    # if on local machine
    root = Path('/media/sf/easystore/kaggle_data/tabular_playgrounds/nov2021/')
    datapath = root/'datasets'
    # edapath = root/'EDA'
    # modelpath = Path('/media/sf/easystore/kaggle_data/tabular_playgrounds/oct2021/models/')

predpath = root/'preds'
subpath = root/'submissions'

# parents=True so a missing intermediate directory does not abort the setup;
# exist_ok=True keeps the cell safe to re-run.
for pth in [datapath, predpath, subpath]:
    pth.mkdir(parents=True, exist_ok=True)
    


## Helpers

In [9]:
SEED = 42


def seed_everything(seed):
    """Seed the stdlib and NumPy global RNGs and export PYTHONHASHSEED.

    Parameters
    ----------
    seed : int
        Value handed to ``random.seed`` and ``np.random.seed`` and written,
        as a string, to the ``PYTHONHASHSEED`` environment variable.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)


seed_everything(seed=SEED)

In [10]:
def reduce_memory_usage(df, verbose=True):
    """Downcast numeric columns of ``df`` to the narrowest dtype that holds their range.

    Integer columns are tried against int8, int16, int32 and int64 in that order;
    float columns against float16 and float32, falling back to float64. The range
    checks are inclusive, so a column whose values sit exactly on a dtype's limits
    (e.g. -128..127) still gets that dtype.

    Note that the float check only looks at the value range: casting to
    float16/float32 can lose precision even when the range fits.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. Columns are reassigned on this object (it is modified in place).
    verbose : bool, default True
        When true, print the final memory footprint and the percentage saved.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with downcast columns.
    """
    numerics = ["int8", "int16", "int32", "int64", "float16", "float32", "float64"]
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024 ** 2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            # non-numeric (object, category, datetime, ...) columns are left untouched
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if str(col_type)[:3] == "int":
            # if no candidate matches (e.g. NaN min/max of an empty column) the
            # column keeps its current dtype
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            # comparisons against NaN/inf are False, so such columns end up as float64
            target = np.float64
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    target = candidate
                    break
            df[col] = df[col].astype(target)

    end_mem = df.memory_usage().sum() / 1024 ** 2
    if verbose:
        # guard the percentage against a zero-byte starting footprint
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print(
            "Mem. usage decreased to {:.2f} Mb ({:.1f}% reduction)".format(
                end_mem, reduction
            )
        )
    return df

## Metadata

In [11]:
# Provenance of the modelling data: which files the training features, the target and
# the test features come from. Earlier experiments used precomputed artifacts instead;
# those alternatives are kept as comments for reference.
dataset_params = dict(
    # train_source=str(datapath/'X-RobustScaled-pca_mle-umap_embedding_20211107-n_comp10-n_neighbors15-rs42.joblib'),
    train_source=str(datapath / 'X_orig.feather'),
    target_source=str(datapath / 'y_orig.joblib'),
    # test_source=str(datapath/'X_test-RobustScaled-pca_mle-umap_embedding_20211107-n_comp10-n_neighbors15-rs42.joblib'),
    test_source=str(datapath / 'X_test_orig-no_scaling.feather'),
    # scaler=str(RobustScaler()),
    # pca=str(load(datapath/'pca_mle-RobustScaled_orig_trainset.joblib')),
    # umap=str(load(datapath/'umap_reducer-20211107-n_comp10-n_neighbors15-rs42-pca_mle-RobustScaled_orig_trainset.joblib')),
)

# Optional record of how the preprocessing pipeline was sequenced (currently unused):
# dataset_params['preprocessing_pipeline'] = '[scaler, pca, umap]'

# Read the training features (feather) and the target (joblib); the test-set loaders
# are left disabled in this notebook.
X = pd.read_feather(dataset_params['train_source'])
y = load(dataset_params['target_source'])
# X_test = pd.read_feather(dataset_params['test_source'])

# Record the training set's dimensions alongside the source paths.
dataset_params.update(feature_count=X.shape[1], instance_count=X.shape[0])
    

In [12]:
# Everything about the run that is NOT a model hyperparameter: preprocessing and
# validation settings plus the dataset metadata. In this sweep version it also
# serves as the config logged to W&B.
exmodel_config = dict(
    arch='lightgbm',
    type='sweep',
    denoising="cleanlab",
    level=1,
    random_state=SEED,
    tuner="Optuna",
    cross_val_strategy=None,  # None for holdout, or the relevant sklearn class
    kfolds=1,  # if 1, that means just doing holdout
    test_size=0.2,
    scaler=str(RobustScaler()),
)
# dataset metadata goes last, overriding any identically named key above
exmodel_config.update(dataset_params)

# Keyword arguments forwarded to W&B through the Optuna callback.
wandb_kwargs = dict(
    # run name = notebook filename without its ".ipynb" suffix (last 6 chars) + current HHMMSS
    name=f"{os.environ['WANDB_NOTEBOOK_NAME'][:-6]}_{datetime.now().strftime('%H%M%S')}",
    project='202111_Kaggle_tabular_playground',
    tags=['sweep'],
    notes="Sweep for CleanLab hyperparams",
    config=exmodel_config,
)

In [13]:
# Base LightGBM binary classifier used as the estimator inside the CleanLab wrapper.
# The device is chosen by the notebook-level USE_GPU flag instead of being hard-coded,
# and the seed comes from SEED so it stays in step with the rest of the notebook.
lgb_model = LGBMClassifier(
    objective='binary',
    random_state=SEED,
    **(
        {
            'device_type': 'gpu',
            # 15 might be even better for GPU perf, but depends on dataset --
            # see https://lightgbm.readthedocs.io/en/latest/GPU-Performance.html
            'max_bin': 63,
            # forces use of single precision rather than double for better perf,
            # esp on consumer Nvidia chips
            'gpu_use_dp': False,
        }
        if USE_GPU
        # on CPU keep LightGBM's defaults (the GPU-oriented max_bin is not applied)
        else {'device_type': 'cpu'}
    ),
)

In [14]:
def objective(trial, X=X, y=y, model=lgb_model,
              test_size=exmodel_config['test_size'],
              random_state=exmodel_config['random_state']):
    """Optuna objective: hold-out AUC of a CleanLab-wrapped classifier.

    Samples the CleanLab options (`prune_method`, `converge_latent_estimates`,
    `pulearning`) from the trial, fits `LearningWithNoisyLabels` around a fresh
    copy of `model` on the training split, and scores it on the validation split.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the CleanLab options.
    X, y : array-like
        Features and binary target; converted to NumPy arrays before splitting.
    model : estimator
        Base classifier to wrap. It is cloned for every trial, so the object
        passed in (including the shared default) is never fitted in place.
    test_size : float
        Fraction of rows held out for validation; defaults to the value logged
        in `exmodel_config`.
    random_state : int
        Seed for the train/validation split; defaults to the value logged in
        `exmodel_config`.

    Returns
    -------
    float
        ROC AUC on the validation split (the study maximises this).
    """
    # local import: `clone` is not part of the notebook's import cell
    from sklearn.base import clone

    X = np.array(X)
    y = np.array(y)
    X_train, X_valid, y_train, y_valid = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )
    rp_params = {
        'prune_method': trial.suggest_categorical('prune_method', ['prune_by_noise_rate', 'prune_by_class', 'both']),
        'converge_latent_estimates': trial.suggest_categorical('converge_latent_estimates', [True, False]),
        'pulearning': trial.suggest_categorical('pulearning', [0,1,None])
    }

    # use the `model` argument (previously the global lgb_model was used regardless),
    # wrapped around an unfitted copy so trials do not share estimator state
    rp = LearningWithNoisyLabels(clf=clone(model), **rp_params)
    rp.fit(X_train, y_train)
    preds = rp.predict_proba(X_valid)[:,1]
    valid_auc = roc_auc_score(y_true=y_valid, y_score=preds)
    print(f"Valid AUC score for is {valid_auc}")
    return valid_auc
    

In [15]:
# Optuna callback that reports each finished trial to W&B, configured with wandb_kwargs
wandbc = WeightsAndBiasesCallback(wandb_kwargs=wandb_kwargs)

  wandbc = WeightsAndBiasesCallback(wandb_kwargs=wandb_kwargs)
[34m[1mwandb[0m: Currently logged in as: [33mhushifang[0m (use `wandb login --relogin` to force relogin)


In [16]:
# Fresh Optuna study that maximises the objective's return value; the TPE sampler
# is seeded from SEED.
# study = load()
study = optuna.create_study(
    study_name='cleanlab_20211118',
    sampler=TPESampler(seed=int(SEED)),
    direction="maximize",
)

[32m[I 2021-11-18 14:59:38,878][0m A new study created in memory with name: cleanlab_20211118[0m


In [17]:
# Earlier variant, kept for reference: optimise in batches of 2 trials and dump the
# study to disk after every batch.
# NOTE(review): `studypath` is not defined anywhere in the visible notebook -- define it before re-enabling.
# for x in range(1, 250):
#     study.optimize(objective, n_trials = 2, callbacks = [wandbc]) #n_jobs = multiprocessing.cpu_count())
#     dump(study, filename=studypath/f'optuna-dataset_study-feature_reduction-20211108.joblib')

# run 18 trials of the objective, passing the W&B callback to the study
study.optimize(objective, n_trials=18, callbacks=[wandbc])

[32m[I 2021-11-18 14:59:54,501][0m Trial 0 finished with value: 0.7202770549886881 and parameters: {'prune_method': 'prune_by_class', 'converge_latent_estimates': True, 'pulearning': None}. Best is trial 0 with value: 0.7202770549886881.[0m


Valid AUC score for is 0.7202770549886881


[32m[I 2021-11-18 15:00:08,040][0m Trial 1 finished with value: 0.7197124054723177 and parameters: {'prune_method': 'prune_by_class', 'converge_latent_estimates': True, 'pulearning': 0}. Best is trial 0 with value: 0.7202770549886881.[0m


Valid AUC score for is 0.7197124054723177


[32m[I 2021-11-18 15:00:22,779][0m Trial 2 finished with value: 0.720419834660914 and parameters: {'prune_method': 'prune_by_class', 'converge_latent_estimates': False, 'pulearning': None}. Best is trial 2 with value: 0.720419834660914.[0m


Valid AUC score for is 0.720419834660914


[32m[I 2021-11-18 15:00:36,199][0m Trial 3 finished with value: 0.7192350353289598 and parameters: {'prune_method': 'prune_by_class', 'converge_latent_estimates': False, 'pulearning': 1}. Best is trial 2 with value: 0.720419834660914.[0m


Valid AUC score for is 0.7192350353289598


[32m[I 2021-11-18 15:00:49,879][0m Trial 4 finished with value: 0.724119967966238 and parameters: {'prune_method': 'both', 'converge_latent_estimates': True, 'pulearning': 1}. Best is trial 4 with value: 0.724119967966238.[0m


Valid AUC score for is 0.724119967966238


[32m[I 2021-11-18 15:01:02,968][0m Trial 5 finished with value: 0.7198632599410275 and parameters: {'prune_method': 'prune_by_class', 'converge_latent_estimates': True, 'pulearning': 0}. Best is trial 4 with value: 0.724119967966238.[0m


Valid AUC score for is 0.7198632599410275


[32m[I 2021-11-18 15:01:16,366][0m Trial 6 finished with value: 0.7203339047455115 and parameters: {'prune_method': 'both', 'converge_latent_estimates': False, 'pulearning': None}. Best is trial 4 with value: 0.724119967966238.[0m


Valid AUC score for is 0.7203339047455115


[32m[I 2021-11-18 15:01:29,047][0m Trial 7 finished with value: 0.7193737491245816 and parameters: {'prune_method': 'prune_by_class', 'converge_latent_estimates': False, 'pulearning': 1}. Best is trial 4 with value: 0.724119967966238.[0m


Valid AUC score for is 0.7193737491245816


[32m[I 2021-11-18 15:01:42,447][0m Trial 8 finished with value: 0.7196990737090969 and parameters: {'prune_method': 'prune_by_class', 'converge_latent_estimates': True, 'pulearning': 0}. Best is trial 4 with value: 0.724119967966238.[0m


Valid AUC score for is 0.7196990737090969


[32m[I 2021-11-18 15:01:55,972][0m Trial 9 finished with value: 0.7197984287411806 and parameters: {'prune_method': 'prune_by_class', 'converge_latent_estimates': False, 'pulearning': 1}. Best is trial 4 with value: 0.724119967966238.[0m


Valid AUC score for is 0.7197984287411806


[32m[I 2021-11-18 15:02:08,563][0m Trial 10 finished with value: 0.7198927801944912 and parameters: {'prune_method': 'prune_by_noise_rate', 'converge_latent_estimates': True, 'pulearning': None}. Best is trial 4 with value: 0.724119967966238.[0m


Valid AUC score for is 0.7198927801944912


[32m[I 2021-11-18 15:02:21,352][0m Trial 11 finished with value: 0.7204227244506201 and parameters: {'prune_method': 'both', 'converge_latent_estimates': False, 'pulearning': None}. Best is trial 4 with value: 0.724119967966238.[0m


Valid AUC score for is 0.7204227244506201


[32m[I 2021-11-18 15:02:34,089][0m Trial 12 finished with value: 0.7201589725856447 and parameters: {'prune_method': 'both', 'converge_latent_estimates': True, 'pulearning': None}. Best is trial 4 with value: 0.724119967966238.[0m


Valid AUC score for is 0.7201589725856447


[32m[I 2021-11-18 15:02:48,030][0m Trial 13 finished with value: 0.7204413167924674 and parameters: {'prune_method': 'both', 'converge_latent_estimates': False, 'pulearning': None}. Best is trial 4 with value: 0.724119967966238.[0m


Valid AUC score for is 0.7204413167924674


[32m[I 2021-11-18 15:03:00,956][0m Trial 14 finished with value: 0.7200748074951835 and parameters: {'prune_method': 'both', 'converge_latent_estimates': False, 'pulearning': None}. Best is trial 4 with value: 0.724119967966238.[0m


Valid AUC score for is 0.7200748074951835


[32m[I 2021-11-18 15:03:13,890][0m Trial 15 finished with value: 0.7201925456573416 and parameters: {'prune_method': 'both', 'converge_latent_estimates': True, 'pulearning': None}. Best is trial 4 with value: 0.724119967966238.[0m


Valid AUC score for is 0.7201925456573416


[32m[I 2021-11-18 15:03:26,526][0m Trial 16 finished with value: 0.7201536703313662 and parameters: {'prune_method': 'prune_by_noise_rate', 'converge_latent_estimates': True, 'pulearning': None}. Best is trial 4 with value: 0.724119967966238.[0m


Valid AUC score for is 0.7201536703313662


[32m[I 2021-11-18 15:03:39,327][0m Trial 17 finished with value: 0.7202078992453572 and parameters: {'prune_method': 'both', 'converge_latent_estimates': False, 'pulearning': None}. Best is trial 4 with value: 0.724119967966238.[0m


Valid AUC score for is 0.7202078992453572


In [18]:
# mark the W&B run as finished now that the sweep is done
wandb.finish()

VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
converge_latent_estimates,██▁▁██▁▁█▁█▁█▁▁██▁
pulearning,▁██▁█▁█
value,▂▂▃▁█▂▃▁▂▂▂▃▂▃▂▂▂▂

0,1
converge_latent_estimates,False
prune_method,both
value,0.72021


In [19]:
# show the parameter combination of the best trial in the study
study.best_params

{'prune_method': 'both', 'converge_latent_estimates': True, 'pulearning': 1}

In [21]:
# Persist the finished study with joblib so it can be reloaded later with `load`.
study_file = Path('/media/sf/easystore/kaggle_data/tabular_playgrounds/nov2021/studies/cleanlab_lgboost_20211118.joblib')
# create the target directory first: dump() does not create missing directories,
# and failing here would lose the results of the whole sweep
study_file.parent.mkdir(parents=True, exist_ok=True)
dump(study, study_file)

['/media/sf/easystore/kaggle_data/tabular_playgrounds/nov2021/studies/cleanlab_lgboost_20211118.joblib']