# denoising_20211109
Taking ideas from [this Bryan Arnold notebook](https://www.kaggle.com/puremath86/label-correction-experiments-tps-nov-21), which does a few cool things but mainly attempts denoising the labels with `cleanlab`

In [2]:
# notebook configuration
COLAB = False # True when running on Google Colab: the data-path cell then mounts Google Drive instead of using local paths (original note: "will trigger manual installation of packages" -- no install cell is visible here, confirm)
USE_GPU = True  # intended to switch GPU training on; NOTE(review): not referenced in the cells visible here -- confirm it is used downstream

In [1]:
# basic imports
from pathlib import Path
import os
import math
from datetime import datetime
import random
import datatable as dt
import gc; gc.enable()

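- `gc` is Python's garbage-collection interface; the author of the source notebook uses it, in tandem with `del` statements, to keep memory usage down. (Collection is already enabled by default, so `gc.enable()` is just being explicit.)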

In [3]:
# render matplotlib figures inline, directly below the cell that produces them
%matplotlib inline
# turn off Jedi-based tab completion in IPython
%config Completer.use_jedi = False
# name reported to Weights & Biases for this notebook; embeds today's date as YYYYMMDD
os.environ['WANDB_NOTEBOOK_NAME'] = f"nb_{datetime.now().strftime('%Y%m%d')}.ipynb"

Now, non-stdlib imports

In [9]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm

# general ML tooling
from sklearn.model_selection import train_test_split, StratifiedKFold, KFold
from sklearn.metrics import log_loss, roc_auc_score
import wandb
from wandb.xgboost import wandb_callback
from wandb.lightgbm import wandb_callback
from sklearn.impute import SimpleImputer #, KNNImputer

import seaborn as sns

from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
# from sklearn.ensemble import StackingClassifier, RandomForestClassifier
from sklearn.preprocessing import RobustScaler #StandardScaler #, MinMaxScaler, MaxAbsScaler, RobustScaler, PolynomialFeatures
from sklearn.linear_model import LogisticRegression
# from sklearn.feature_selection import SelectKBest, f_regression
from joblib import dump, load
# feature engineering tools
# from sklearn.feature_selection import mutual_info_regression
# import featuretools as ft

from BorutaShap import BorutaShap

In [5]:
# from pytorch_widedeep import Trainer
# from pytorch_widedeep.preprocessing import WidePreprocessor, TabPreprocessor
# from pytorch_widedeep.models import Wide, TabMlp, WideDeep#, SAINT, TabTransformer, TabNet, TabFastFormer, TabResnet
# from pytorch_widedeep.metrics import Accuracy
# from torchmetrics import AUROC
# import torch
# from torch.optim import Adam, AdamW, Adagrad, SGD, RMSprop, LBFGS
# from torch.optim.lr_scheduler import ReduceLROnPlateau, CosineAnnealingWarmRestarts, CyclicLR, OneCycleLR, StepLR, CosineAnnealingLR
# from pytorch_widedeep.callbacks import EarlyStopping, LRHistory, ModelCheckpoint

Now, datapath setup

In [5]:
if not COLAB:
    # Local machine: everything lives under one competition root on the external drive.
    # (previously pointed at .../tabular_playgrounds/sep2021/; an EDA folder and an
    #  oct2021 models folder were also used at some point but are disabled here)
    root = Path('/media/sf/easystore/kaggle_data/tabular_playgrounds/nov2021/')
    datapath, predpath, subpath = (
        root / subdir for subdir in ('datasets', 'preds', 'submissions')
    )

    # Create the working folders on first use. `parents` is deliberately not set, so a
    # missing root (e.g. drive not mounted) raises instead of being created silently.
    for pth in (datapath, predpath, subpath):
        pth.mkdir(exist_ok=True)

else:
    # Google Colab: the data sits on Google Drive, which has to be mounted first.
    from google.colab import drive
    drive.mount('/content/drive')

    # NOTE(review): only `datapath` is defined on this branch; `predpath` and `subpath`
    # exist only for local runs.
    datapath = Path('/content/drive/MyDrive/kaggle/tabular_playgrounds/nov2021/')
    


In [6]:
SEED = 42


def seed_everything(seed):
    """Seed the random number generators this notebook relies on.

    Seeds Python's ``random`` module and NumPy's legacy global generator, then
    exports ``PYTHONHASHSEED``.

    Parameters
    ----------
    seed : int
        Value handed to every generator.

    Notes
    -----
    The interpreter reads ``PYTHONHASHSEED`` at start-up, so exporting it here
    can only influence processes launched afterwards, not the running kernel.
    """
    for seeder in (random.seed, np.random.seed):
        seeder(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)


seed_everything(seed=SEED)

The following function is used to optimize dataset memory utilization in RAM

In [7]:
def reduce_memory_usage(df, verbose=True):
    """Downcast the numeric columns of ``df`` to the narrowest NumPy dtype that fits.

    For every column whose dtype is one of the signed-integer or float types
    listed in ``numerics``, the column's minimum and maximum are compared with
    the representable range of each candidate dtype (smallest first) and the
    column is cast to the first candidate that contains both. Bounds are
    inclusive, so a column spanning exactly ``[-128, 127]`` becomes ``int8``.
    Columns of any other dtype are left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place.
    verbose : bool, default True
        When true, print the final size in Mb and the percentage saved.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after downcasting.

    Notes
    -----
    Only the value *range* is checked. Casting floats to ``float16`` or
    ``float32`` can therefore discard precision even though min and max fit.
    """
    numerics = ["int8", "int16", "int32", "int64", "float16", "float32", "float64"]
    # Candidates are ordered narrowest first; the first one that fits wins.
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024 ** 2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if str(col_type)[:3] == "int":
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                # Inclusive comparison: the boundary values themselves are representable.
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            # No match (e.g. min/max are NaN for an empty column): leave the column as is.
        else:
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Out of float32 range, or min/max is NaN/inf: keep full precision.
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024 ** 2
    if verbose:
        # Guard the percentage against a frame that occupied no memory to begin with.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print(
            "Mem. usage decreased to {:.2f} Mb ({:.1f}% reduction)".format(
                end_mem, reduction
            )
        )
    return df

- Basically, you start by creating a list of the numeric types you may be using, and initializing the memory utilization (so you can see at the end how much you've saved). 
- Then, you iterate over the columns (`for col in df.columns:`) and check if the column's [[data type]] is numeric; if it is, then you store the minimum and maximum values for the column (a type can hold the whole column exactly when it can hold both extremes, so these two values are all you need to check). 
- Then, you go through a chain of `if`/`elif` branches: first deciding whether the datatype is an integer or a float, and then stepping through the different [[NumPy]] integer or float types in increasing size, [[typecasting]] to the first one whose range accommodates both the max and min value from the column. (*Note that `np.iinfo(np.int64).min` et al. return the smallest/largest value the type can represent, not a size in bits — which is exactly what the range comparison needs.*)
- Then, you find out what the final memory size is, and print out how big of a savings you've obtained.

In [10]:
# Metadata describing where the training data came from and how it is preprocessed.
# An earlier variant of this cell also recorded a test-set source plus fitted PCA and
# UMAP artifacts (and a 'preprocessing_pipeline' entry); those are disabled here.
dataset_params = dict(
    train_source=datapath / 'X_orig.feather',
    target_source=datapath / 'y_orig.joblib',
    scaler=str(RobustScaler()),  # stored as its repr string, not as the estimator object
)

# Load features and target from the locations recorded above.
X = pd.read_feather(dataset_params['train_source'])
y = load(dataset_params['target_source'])

# Record the shape of the loaded feature matrix: columns are features, rows are instances.
dataset_params.update(
    feature_count=X.shape[1],
    instance_count=X.shape[0],
)
    

In [10]:
# # dataset_params will initially include either trivial class instances or loaded, precomputed artifacts
# dataset_params = {
#     'train_source': str(datapath/'X-RobustScaled-pca_mle-umap_embedding_20211107-n_comp10-n_neighbors15-rs42.joblib'),
#     'target_source': str(datapath/'y_orig.joblib'),
#     'test_source': str(datapath/'X_test-RobustScaled-pca_mle-umap_embedding_20211107-n_comp10-n_neighbors15-rs42.joblib'),
#     'scaler': str(RobustScaler()),
#     'pca': str(load(datapath/'pca_mle-RobustScaled_orig_trainset.joblib')),
#     'umap': str(load(datapath/'umap_reducer-20211107-n_comp10-n_neighbors15-rs42-pca_mle-RobustScaled_orig_trainset.joblib')),
# }   

# # referring back to the already-entered attributes, specify how the pipeline was sequenced
# # dataset_params['preprocessing_pipeline'] = str([dataset_params['scaler'], dataset_params['pca'], dataset_params['umap']]) # ACTUALLY this is unwieldy
# dataset_params['preprocessing_pipeline'] = '[scaler, pca, umap]' # more fragile, but also more readable

# # now, load the datasets and generate more metadata from them
# X = load(dataset_params['train_source'])
# y = load(dataset_params['target_source'])
# X_test = load(dataset_params['test_source'])

# dataset_params['feature_count'] = X.shape[1]
# dataset_params['instance_count'] = X.shape[0]
    

## Ex-Model Config

In [11]:
# # meta-config for preprocessing and cross-validation, but NOT for model parameters
# exmodel_config = {
# #     "feature_selector": SelectKBest,
# #     "k_best": 80,
# #     "feature_selection_scoring": f_regression,
# #     'random_state': SEED,
# #     'feature_generation': ['NaN_counts', 'SummaryStats', 'NaN_OneHots'],
# #     'subsample': 1,
#     'cross_val_strategy': KFold, # None for holdout, or the relevant sklearn class
#     'kfolds': 5, # if 1, that means just doing holdout
#     'test_size': 0.2,
#     **dataset_params
# #     'features_created': False,
# #     'feature_creator': None,
# }

In [11]:
# preview the first rows of the raw (unscaled) feature matrix
X.head()

Unnamed: 0,f0,f1,f2,f3,f4,f5,f6,f7,f8,f9,...,f90,f91,f92,f93,f94,f95,f96,f97,f98,f99
0,0.106643,3.59437,132.804,3.18428,0.081971,1.18859,3.73238,2.26627,2.09959,0.01233,...,0.010739,1.09862,0.013331,-0.011715,0.052759,0.0654,4.21125,1.97877,0.085974,0.240496
1,0.125021,1.67336,76.5336,3.37825,0.0994,5.09366,1.27562,-0.471318,4.54594,0.037706,...,0.135838,3.46017,0.017054,0.124863,0.154064,0.606848,-0.267928,2.57786,-0.020877,0.024719
2,0.03633,1.49747,233.546,2.19435,0.026914,3.12694,5.05687,3.84946,1.80187,0.056995,...,0.11731,4.883,0.085222,0.032396,0.116092,-0.001688,-0.520069,2.14112,0.124464,0.148209
3,-0.014077,0.246,779.967,1.89064,0.006948,1.53112,2.698,4.51733,4.50332,0.123494,...,-0.015347,3.47439,-0.017103,-0.0081,0.062013,0.041193,0.511657,1.9686,0.040017,0.044873
4,-0.003259,3.71542,156.128,2.14772,0.018284,2.09859,4.15492,-0.038236,3.37145,0.034166,...,0.013781,1.91059,-0.042943,0.105616,0.125072,0.037509,1.04379,1.07481,-0.012819,0.072798


In [13]:
# Fit a default RobustScaler on X and wrap the resulting array back into a
# DataFrame that carries the original column names.
X_robust = pd.DataFrame(
    RobustScaler().fit_transform(X),
    columns=X.columns,
)
X_robust.head()


Unnamed: 0,f0,f1,f2,f3,f4,f5,f6,f7,f8,f9,...,f90,f91,f92,f93,f94,f95,f96,f97,f98,f99
0,0.023869,0.414343,-0.003178,0.223129,0.219181,-0.549174,0.356604,-0.117173,-0.149785,-0.569723,...,-0.615937,-0.519509,-0.523,-1.07358,-0.111175,0.042435,0.625515,-0.282294,0.287257,2.097848
1,0.073411,-0.324111,-0.220699,0.301799,0.406584,0.980651,-0.58429,-1.216782,0.824004,-0.258296,...,1.142983,0.432846,-0.487967,1.07122,0.943432,7.115126,-1.115475,-0.041835,-0.812236,-0.388992
2,-0.165673,-0.391725,0.386256,-0.178365,-0.372808,0.210182,0.863859,0.518747,-0.268295,-0.021567,...,0.882475,1.006638,0.153544,-0.380862,0.548134,-0.83391,-1.213478,-0.217131,0.683318,1.034235
3,-0.301555,-0.872802,2.498527,-0.301544,-0.587486,-0.414987,-0.039545,0.787011,0.807038,0.794548,...,-0.982719,0.43858,-0.809415,-1.016802,-0.014837,-0.273769,-0.812462,-0.286376,-0.185636,-0.156724
4,-0.272393,0.460876,0.086985,-0.197278,-0.465605,-0.192678,0.518429,-1.042825,0.356489,-0.30174,...,-0.573168,-0.192062,-1.052588,0.768968,0.641618,-0.321887,-0.60563,-0.64512,-0.729316,0.165118


In [14]:
from sklearn.preprocessing import StandardScaler

In [15]:
# Fit a default StandardScaler on X and wrap the resulting array back into a
# DataFrame that carries the original column names.
X_standard = pd.DataFrame(
    StandardScaler().fit_transform(X),
    columns=X.columns,
)
X_standard.head()

Unnamed: 0,f0,f1,f2,f3,f4,f5,f6,f7,f8,f9,...,f90,f91,f92,f93,f94,f95,f96,f97,f98,f99
0,-0.382553,0.705772,-0.315075,0.347277,-0.229657,-0.87566,0.660314,-0.197064,-0.286162,-0.28927,...,-0.537157,-0.872508,-0.258806,-0.595537,-0.199502,-0.196145,1.067358,-0.400887,-0.167145,0.443374
1,-0.347377,-0.530387,-0.417061,0.472862,-0.187909,1.623543,-0.910506,-1.96398,1.309644,-0.229122,...,0.573313,0.658473,-0.252018,0.548089,0.019765,2.392938,-1.806811,-0.008064,-0.41211,-0.371198
2,-0.517136,-0.643571,-0.132486,-0.29365,-0.361533,0.364863,1.507175,0.824771,-0.480372,-0.183401,...,0.408845,1.580886,-0.127714,-0.226174,-0.062423,-0.516946,-1.968603,-0.294434,-0.078904,0.094984
3,-0.613619,-1.448884,0.857867,-0.490286,-0.409357,-0.656445,-0.001055,1.255833,1.281843,-0.02578,...,-0.768719,0.667692,-0.314304,-0.565262,-0.179472,-0.311897,-1.306572,-0.407556,-0.272505,-0.295118
4,-0.592913,0.783666,-0.272802,-0.323841,-0.382205,-0.29327,0.93048,-1.684456,0.543499,-0.237512,...,-0.510155,-0.346112,-0.361423,0.386926,-0.042986,-0.329511,-0.965117,-0.993613,-0.393636,-0.189697


In [16]:
from sklearn.preprocessing import PowerTransformer

In [17]:
# Fit a default PowerTransformer on X and wrap the resulting array back into a
# DataFrame that carries the original column names.
X_power = pd.DataFrame(
    PowerTransformer().fit_transform(X),
    columns=X.columns,
)
X_power.head()

Unnamed: 0,f0,f1,f2,f3,f4,f5,f6,f7,f8,f9,...,f90,f91,f92,f93,f94,f95,f96,f97,f98,f99
0,-0.330764,0.720239,-0.185486,0.362574,-0.156842,-0.853394,0.667399,-0.16051,-0.234598,-0.357589,...,-0.565506,-0.847819,-0.307379,-0.927254,-0.19451,-0.098997,1.055397,-0.388207,-0.105342,0.724264
1,-0.272488,-0.486992,-0.313108,0.485275,-0.087189,1.558091,-0.899712,-2.103362,1.270651,-0.229298,...,0.641452,0.676891,-0.297071,0.867816,0.251454,2.603714,-1.911892,0.00426,-0.532986,-0.400761
2,-0.567016,-0.604689,0.0209,-0.272131,-0.388096,0.395922,1.479006,0.828781,-0.432729,-0.135055,...,0.468604,1.513583,-0.111917,-0.323559,0.09144,-0.786038,-2.114633,-0.281473,0.036836,0.28366
3,-0.750805,-1.4997,0.958682,-0.470166,-0.476486,-0.621742,0.019516,1.230873,1.245859,0.170004,...,-0.829848,0.685489,-0.392448,-0.876844,-0.151133,-0.331031,-1.324488,-0.394903,-0.282993,-0.280477
4,-0.710261,0.792576,-0.135717,-0.302424,-0.425993,-0.249634,0.928278,-1.759618,0.571015,-0.246894,...,-0.534985,-0.297603,-0.465794,0.627528,0.130013,-0.367829,-0.951063,-0.988675,-0.498888,-0.119675


In [20]:
from sklearn.preprocessing import QuantileTransformer

# Map every feature onto its empirical quantiles (default output distribution).
# QuantileTransformer estimates the quantiles from a random subsample of rows when X
# has more rows than its `subsample` limit; pinning random_state to the notebook-wide
# SEED makes that draw -- and so this cell's output -- identical on every re-run.
X_quantile = pd.DataFrame(
    QuantileTransformer(random_state=SEED).fit_transform(X),
    columns=X.columns,
)
X_quantile.head()

Unnamed: 0,f0,f1,f2,f3,f4,f5,f6,f7,f8,f9,...,f90,f91,f92,f93,f94,f95,f96,f97,f98,f99
0,0.524853,0.708279,0.498227,0.601757,0.615981,0.235575,0.689244,0.445161,0.4307,0.208504,...,0.188796,0.226388,0.225639,0.06287,0.440959,0.521712,0.829877,0.374985,0.646461,0.884742
1,0.56921,0.349651,0.33973,0.640245,0.699997,0.954619,0.22794,0.010521,0.888431,0.360631,...,0.901757,0.702314,0.24231,0.945676,0.873161,0.969689,0.020048,0.481632,0.101632,0.288204
2,0.285286,0.313865,0.702259,0.41375,0.292967,0.601331,0.947447,0.755695,0.366651,0.488799,...,0.860375,0.943617,0.584301,0.308015,0.760965,0.105666,0.008053,0.405745,0.792971,0.855855
3,0.116842,0.07178,0.894046,0.348115,0.183657,0.304618,0.482549,0.880025,0.882009,0.840215,...,0.064177,0.705091,0.112006,0.075571,0.491928,0.353474,0.108497,0.373276,0.394803,0.413024
4,0.14844,0.734622,0.556966,0.403929,0.243439,0.413076,0.785829,0.038596,0.660243,0.33747,...,0.206515,0.402987,0.047811,0.86064,0.794711,0.32827,0.207874,0.196023,0.129622,0.587897


In [21]:
# Quantile-transform every feature, this time mapping onto a normal output distribution.
# QuantileTransformer is already imported by the previous quantile cell, so the
# duplicate import is dropped. random_state is pinned to the notebook-wide SEED because
# the transformer subsamples rows at random when X exceeds its `subsample` limit;
# without it, re-running this cell could yield different values.
X_quantile_norm = pd.DataFrame(
    QuantileTransformer(output_distribution='normal', random_state=SEED).fit_transform(X),
    columns=X.columns,
)
X_quantile_norm.head()

Unnamed: 0,f0,f1,f2,f3,f4,f5,f6,f7,f8,f9,...,f90,f91,f92,f93,f94,f95,f96,f97,f98,f99
0,0.058439,0.545402,-0.00652,0.251106,0.293641,-0.723694,0.493748,-0.131654,-0.178317,-0.806803,...,-0.88856,-0.756515,-0.755013,-1.538723,-0.150822,0.052447,0.945523,-0.313074,0.379596,1.205548
1,0.173059,-0.383496,-0.415184,0.352251,0.519891,1.703883,-0.747889,-2.296824,1.225082,-0.355518,...,1.303408,0.529645,-0.702139,1.610642,1.152154,1.888093,-2.050964,-0.040877,-1.259755,-0.557536
2,-0.568764,-0.482315,0.532258,-0.230806,-0.552589,0.260788,1.619715,0.69535,-0.343726,-0.030215,...,1.082079,1.594912,0.208438,-0.505334,0.712735,-1.239604,-2.404666,-0.234336,0.808151,1.066985
3,-1.187532,-1.458381,1.245232,-0.400038,-0.904151,-0.511822,-0.041367,1.174559,1.191639,0.990184,...,-1.520453,0.53757,-1.220441,-1.444288,-0.01974,-0.382473,-1.234826,-0.318433,-0.259399,-0.215769
4,-1.042563,0.624011,0.140354,-0.256014,-0.700099,-0.216489,0.793705,-1.767176,0.413965,-0.417051,...,-0.8202,-0.244171,-1.668502,1.08468,0.828077,-0.450044,-0.815142,-0.849233,-1.116413,0.223979
