Header for notebooks -- customize as required.

In [1]:
# notebook configuration
COLAB = False # will trigger manual installation of packages
USE_GPU = True 

In [2]:
# basic imports
from pathlib import Path
import os
import math
from datetime import datetime
import random

In [3]:
%matplotlib inline
%config Completer.use_jedi = False
os.environ['WANDB_NOTEBOOK_NAME'] = f"nb_{datetime.now().strftime('%Y%m%d')}.ipynb"

Now, non-stdlib imports

In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm

# general ML tooling
from sklearn.model_selection import train_test_split, StratifiedKFold, KFold
from sklearn.metrics import log_loss, roc_auc_score
import wandb
from wandb.xgboost import wandb_callback
from wandb.lightgbm import wandb_callback
from sklearn.impute import SimpleImputer #, KNNImputer

import seaborn as sns

from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
# from sklearn.ensemble import StackingClassifier, RandomForestClassifier
from sklearn.preprocessing import RobustScaler #StandardScaler #, MinMaxScaler, MaxAbsScaler, RobustScaler, PolynomialFeatures
from sklearn.linear_model import LogisticRegression
# from sklearn.feature_selection import SelectKBest, f_regression
from joblib import dump, load
# feature engineering tools
# from sklearn.feature_selection import mutual_info_regression
# import featuretools as ft

from BorutaShap import BorutaShap

In [5]:
from pytorch_widedeep import Trainer
from pytorch_widedeep.preprocessing import WidePreprocessor, TabPreprocessor
from pytorch_widedeep.models import Wide, TabMlp, WideDeep, SAINT#, TabTransformer, TabNet, TabFastFormer, TabResnet
from pytorch_widedeep.metrics import Accuracy
from torchmetrics import AUROC
import torch
from torch.optim import Adam, AdamW, Adagrad, SGD, RMSprop, LBFGS
from torch.optim.lr_scheduler import ReduceLROnPlateau, CosineAnnealingWarmRestarts, CyclicLR, OneCycleLR, StepLR, CosineAnnealingLR
from pytorch_widedeep.callbacks import EarlyStopping, LRHistory, ModelCheckpoint

In [6]:
# import category_encoders as ce

`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.


Now, datapath setup

In [7]:
if COLAB:
    # mount Google Drive
    from google.colab import drive
    drive.mount('/content/drive')
    
    # handling datapath
    datapath = Path('/content/drive/MyDrive/kaggle/tabular_playgrounds/nov2021/')
    
else:
    # if on local machine
#     datapath = Path('/media/sf/easystore/kaggle_data/tabular_playgrounds/sep2021/')  
    root = Path('/media/sf/easystore/kaggle_data/tabular_playgrounds/nov2021/')
    datapath = root/'datasets'
    # edapath = root/'EDA'
    # modelpath = Path('/media/sf/easystore/kaggle_data/tabular_playgrounds/oct2021/models/')
    predpath = root/'preds'
    subpath = root/'submissions'
    
    for pth in [datapath, predpath, subpath]:
        pth.mkdir(exist_ok=True)
    


## Helpers

In [8]:
SEED = 42

# Function to seed everything
def seed_everything(seed, reproducible=True):
    random.seed(seed)
    np.random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    # torch.manual_seed(seed) # set torch CPU seed
    # if torch.cuda.is_available():
    #     torch.cuda.manual_seed_all(seed) # set torch GPU(s) seed(s)
    # if reproducible and torch.backends.cudnn.is_available():
    #     torch.backends.cudnn.deterministic = True
    #     torch.backends.cudnn.benchmark = False

seed_everything(seed=SEED)

In [9]:
def reduce_memory_usage(df, verbose=True):
    numerics = ["int8", "int16", "int32", "int64", "float16", "float32", "float64"]
    start_mem = df.memory_usage().sum() / 1024 ** 2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == "int":
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if (
                    c_min > np.finfo(np.float16).min
                    and c_max < np.finfo(np.float16).max
                ):
                    df[col] = df[col].astype(np.float16)
                elif (
                    c_min > np.finfo(np.float32).min
                    and c_max < np.finfo(np.float32).max
                ):
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024 ** 2
    if verbose:
        print(
            "Mem. usage decreased to {:.2f} Mb ({:.1f}% reduction)".format(
                end_mem, 100 * (start_mem - end_mem) / start_mem
            )
        )
    return df

## Metadata

In [10]:
# dataset_params will initially include either trivial class instances or loaded, precomputed artifacts
dataset_params = {
    # 'train_source': str(datapath/'X-RobustScaled-pca_mle-umap_embedding_20211107-n_comp10-n_neighbors15-rs42.joblib'),
    'train_source': str(datapath/'X_orig.feather'),
    'target_source': str(datapath/'y_corrected.joblib'),
    # 'test_source': str(datapath/'X_test-RobustScaled-pca_mle-umap_embedding_20211107-n_comp10-n_neighbors15-rs42.joblib'),
    'test_source': str(datapath/'X_test_orig-no_scaling.feather'),
    # 'scaler': str(RobustScaler()),
    # 'pca': str(load(datapath/'pca_mle-RobustScaled_orig_trainset.joblib')),
    # 'umap': str(load(datapath/'umap_reducer-20211107-n_comp10-n_neighbors15-rs42-pca_mle-RobustScaled_orig_trainset.joblib')),
}   

# referring back to the already-entered attributes, specify how the pipeline was sequenced
# dataset_params['preprocessing_pipeline'] = str([dataset_params['scaler'], dataset_params['pca'], dataset_params['umap']]) # ACTUALLY this is unwieldy
# dataset_params['preprocessing_pipeline'] = '[scaler, pca, umap]' # more fragile, but also more readable

# now, load the datasets and generate more metadata from them
# X = load(dataset_params['train_source'])
X = pd.read_feather(dataset_params['train_source'])
y = load(dataset_params['target_source'])
# X_test = load(dataset_params['test_source'])
X_test = pd.read_feather(dataset_params['test_source'])

dataset_params['feature_count'] = X.shape[1]
dataset_params['instance_count'] = X.shape[0]
    

In [11]:
# # meta-config for preprocessing and cross-validation, but NOT for model parameters
# exmodel_config = {
# #     "feature_selector": SelectKBest,
# #     "k_best": 80,
# #     "feature_selection_scoring": f_regression,
# #     'random_state': SEED,
# #     'feature_generation': ['NaN_counts', 'SummaryStats', 'NaN_OneHots'],
# #     'subsample': 1,
#     # 'cross_val_strategy': KFold, # None for holdout, or the relevant sklearn class
#     # 'kfolds': 5, # if 1, that means just doing holdout
#     # 'test_size': 0.2,
#     # **dataset_params
# #     'features_created': False,
# #     'feature_creator': None,
# }

In [12]:
# oof_preds_saint_fold0 = load(predpath/f'widedeep_saint-20211127-60epochs-fold0-oofpreds.joblib')

In [13]:
# oof_preds_saint_fold4 = load(predpath/f'widedeep_saint-20211127-55epochs-fold4-oofpreds.joblib')

In [14]:
# type(oof_preds_saint_fold0)

In [15]:
# oof_preds_saint_fold0.shape

In [16]:
# oof_preds_saint_fold4.shape

In [17]:
# test_preds_saint_fold0 = load(predpath/f'widedeep_saint-20211127-60epochs-fold0-testpreds.joblib')
# test_preds_saint_fold4 = load(predpath/f'widedeep_saint-20211127-55epochs-fold4-testpreds.joblib')


In [18]:
# test_preds_saint_fold0.shape, test_preds_saint_fold4.shape

In [19]:
# df = pd.DataFrame({0: test_preds_saint_fold0, 4: test_preds_saint_fold4})
# df

In [20]:
# df.hist()

In [21]:
oof_preds_saint = pd.DataFrame()#{ fold: load(predpath/f'widedeep_saint-20211127-60epochs-fold{fold}-oofpreds.joblib') for fold in range(4)}, columns=[f'saint_f{fold}' for fold in range(4)])
test_preds_saint = pd.DataFrame()

for fold in range(5):
    oof_preds_saint[f'saint_f{fold}'] = load(predpath/f'widedeep_saint-20211127-60epochs-fold{fold}-oofpreds.joblib')
    test_preds_saint[f'saint_f{fold}'] = load(predpath/f'widedeep_saint-20211127-60epochs-fold{fold}-testpreds.joblib')
    
# oof_preds_saint['saint_f4'] = load(predpath/'widedeep_saint-20211127-55epochs-fold4-oofpreds.joblib')
# test_preds_saint = pd.DataFrame({ fold: load(predpath/f'widedeep_saint-20211127-60epochs-fold{fold}-testpreds.joblib') for fold in range(4)}, columns=[f'saint_f{fold}' for fold in range(4)])
# test_preds_saint['saint_f4'] = load(predpath/'widedeep_saint-20211127-55epochs-fold4-testpreds.joblib')

In [22]:
test_preds_saint.corr()

Unnamed: 0,saint_f0,saint_f1,saint_f2,saint_f3,saint_f4
saint_f0,1.0,0.949944,0.957758,0.931124,0.961689
saint_f1,0.949944,1.0,0.944971,0.922293,0.950835
saint_f2,0.957758,0.944971,1.0,0.93881,0.959757
saint_f3,0.931124,0.922293,0.93881,1.0,0.936174
saint_f4,0.961689,0.950835,0.959757,0.936174,1.0


In [23]:
test_preds_saint.iloc[60160:60288, :] # this is all the missing ones

`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.


Unnamed: 0,saint_f0,saint_f1,saint_f2,saint_f3,saint_f4
60160,,,,,
60161,,,,,
60162,,,,,
60163,,,,,
60164,,,,,
...,...,...,...,...,...
60283,,,,,
60284,,,,,
60285,,,,,
60286,,,,,


In [24]:
X_test.iloc[60160]

`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.


f0       0.165661
f1       2.433540
f2     118.158000
f3       0.530859
f4       0.879530
          ...    
f95      0.036365
f96      3.534870
f97      2.322490
f98      0.121823
f99      0.058785
Name: 60160, Length: 100, dtype: float64

In [25]:
y[60160:60293]

60160    1
60161    0
60162    1
60163    1
60164    1
        ..
60288    1
60289    0
60290    1
60291    1
60292    0
Name: corrected_target, Length: 133, dtype: int64

In [26]:
oof_preds_saint.corr()

Unnamed: 0,saint_f0,saint_f1,saint_f2,saint_f3,saint_f4
saint_f0,1.0,0.08017,0.082984,0.068409,0.085974
saint_f1,0.08017,1.0,0.077841,0.060979,0.079138
saint_f2,0.082984,0.077841,1.0,0.06544,0.079452
saint_f3,0.068409,0.060979,0.06544,1.0,0.069823
saint_f4,0.085974,0.079138,0.079452,0.069823,1.0


Let's generate a real OOF pred df

In [27]:
# X_wide = load(datapath/'X_wide.joblib')
# X_tab = load(datapath/'X_tab.joblib')
# cont_cols = load(datapath/'cont_cols.joblib')
# tab_preprocessor = load(datapath/'tab_preprocessor.joblib')

# # train_preds = pd.DataFrame()
# train_preds = load(datapath/'saint_oofpreds.joblib')

# for fold in range(2,5):
#     wide = Wide(wide_dim=np.unique(X_wide).shape[0], pred_dim=1)
#     deeptabular = SAINT(continuous_cols=cont_cols, column_idx=tab_preprocessor.column_idx,)
#     model = WideDeep(wide=wide, deeptabular=deeptabular)
#     model.load_state_dict(torch.load(datapath/f"widedeep_saint-20211127-weights-60epochs-fold{fold}/wd_model.pt"))
#     wide_opt = AdamW(model.wide.parameters(),)
#     deep_opt = SGD(model.deeptabular.parameters(),  lr=0.01, momentum=0.75)

#     wide_sch = CosineAnnealingWarmRestarts(optimizer=wide_opt, T_0=5) 
#     deep_sch = ReduceLROnPlateau(optimizer=deep_opt, )

#     # deep_sch = OneCycleLR(optimizer=deep_opt, max_lr=0.01, steps_per_epoch=X_train_tab.shape[0], epochs=n_epochs)

#     # optimizers = {'deeptabular': deep_opt }
#     # lr_schedulers = {'deeptabular': deep_sch }

#     optimizers = {'wide': wide_opt, 'deeptabular': deep_opt }
#     lr_schedulers = {'wide': wide_sch, 'deeptabular': deep_sch }

#     callbacks = [
#         LRHistory(n_epochs=60), 
#     ]

#     # trainer
#     trainer = Trainer(model=model, 
#                       objective='binary', 
#                       metrics=[Accuracy], # with AUROC got TypeError: '>' not supported between instances of 'NoneType' and 'int' 
#                       seed=42, 
#                       optimizers=optimizers,
#                       callbacks=callbacks
#                      )
    
#     train_preds[f'saint_f{fold}'] = trainer.predict_proba(X_wide=np.array(X_wide), X_tab=np.array(X_tab), batch_size=128)[:,1]


`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.


In [28]:
# train_preds.iloc[60160:60288, :]

In [29]:
# dump(train_preds, datapath/'saint_oofpreds.joblib')
train_preds = load(datapath/'saint_oofpreds.joblib')

In [30]:
X_wide = load(datapath/'X_wide.joblib')
X_tab = load(datapath/'X_tab.joblib')
X_wide_test = load(datapath/'X_wide_test_FIXED.joblib')
X_tab_test = load(datapath/'X_tab_test_FIXED.joblib')
cont_cols = load(datapath/'cont_cols.joblib')
tab_preprocessor = load(datapath/'tab_preprocessor.joblib')

# test_preds = pd.DataFrame()
test_preds = load(predpath/'saint_fixed_testpreds_20211130.joblib')

In [31]:
# test_preds

array([2.5462423 , 2.54651989, 2.54621801, ..., 2.56285044, 2.54622797,
       2.54645533])

In [32]:
test_preds = pd.DataFrame()

In [36]:
# for fold in range(3,5):
#     wide = Wide(wide_dim=np.unique(X_wide).shape[0], pred_dim=1)
#     deeptabular = SAINT(continuous_cols=cont_cols, column_idx=tab_preprocessor.column_idx,)
#     model = WideDeep(wide=wide, deeptabular=deeptabular)
#     model.load_state_dict(torch.load(datapath/f"widedeep_saint-20211127-weights-60epochs-fold{fold}/wd_model.pt"))
#     wide_opt = AdamW(model.wide.parameters(),)
#     deep_opt = SGD(model.deeptabular.parameters(),  lr=0.01, momentum=0.75)

#     wide_sch = CosineAnnealingWarmRestarts(optimizer=wide_opt, T_0=5) 
#     deep_sch = ReduceLROnPlateau(optimizer=deep_opt, )

#     # deep_sch = OneCycleLR(optimizer=deep_opt, max_lr=0.01, steps_per_epoch=X_train_tab.shape[0], epochs=n_epochs)

#     # optimizers = {'deeptabular': deep_opt }
#     # lr_schedulers = {'deeptabular': deep_sch }

#     optimizers = {'wide': wide_opt, 'deeptabular': deep_opt }
#     lr_schedulers = {'wide': wide_sch, 'deeptabular': deep_sch }

#     callbacks = [
#         LRHistory(n_epochs=60), 
#     ]

#     # trainer
#     trainer = Trainer(model=model, 
#                       objective='binary', 
#                       metrics=[Accuracy], # with AUROC got TypeError: '>' not supported between instances of 'NoneType' and 'int' 
#                       seed=42, 
#                       optimizers=optimizers,
#                       callbacks=callbacks
#                      )
    
#     test_preds[f'saint_f{fold}'] = trainer.predict_proba(X_wide=np.array(X_wide_test), X_tab=np.array(X_tab_test), batch_size=128)[:,1]
# # dump(test_preds, 'saint_fixed_testpreds_20211130.joblib')

predict: 100%|██████████| 4219/4219 [01:28<00:00, 47.64it/s]
predict: 100%|██████████| 4219/4219 [01:29<00:00, 47.16it/s]


In [48]:
missing_wide = X_wide_test[60160:60288, :]
missing_tab = X_tab_test[60160:60288, :]
missing_test_preds = pd.DataFrame()

# missing batch
for fold in range(5):
    wide = Wide(wide_dim=np.unique(X_wide).shape[0], pred_dim=1)
    deeptabular = SAINT(continuous_cols=cont_cols, column_idx=tab_preprocessor.column_idx,)
    model = WideDeep(wide=wide, deeptabular=deeptabular)
    model.load_state_dict(torch.load(datapath/f"widedeep_saint-20211127-weights-60epochs-fold{fold}/wd_model.pt"))
    wide_opt = AdamW(model.wide.parameters(),)
    deep_opt = SGD(model.deeptabular.parameters(),  lr=0.01, momentum=0.75)

    wide_sch = CosineAnnealingWarmRestarts(optimizer=wide_opt, T_0=5) 
    deep_sch = ReduceLROnPlateau(optimizer=deep_opt, )

    # deep_sch = OneCycleLR(optimizer=deep_opt, max_lr=0.01, steps_per_epoch=X_train_tab.shape[0], epochs=n_epochs)

    # optimizers = {'deeptabular': deep_opt }
    # lr_schedulers = {'deeptabular': deep_sch }

    optimizers = {'wide': wide_opt, 'deeptabular': deep_opt }
    lr_schedulers = {'wide': wide_sch, 'deeptabular': deep_sch }

    callbacks = [
        LRHistory(n_epochs=60), 
    ]

    # trainer
    trainer = Trainer(model=model, 
                      objective='binary', 
                      metrics=[Accuracy], # with AUROC got TypeError: '>' not supported between instances of 'NoneType' and 'int' 
                      seed=42, 
                      optimizers=optimizers,
                      callbacks=callbacks
                     )
    
    missing_test_preds[f'saint_f{fold}'] = trainer.predict_proba(X_wide=np.array(missing_wide), X_tab=np.array(missing_tab), batch_size=128)[:,1]
# dump(test_preds, 'saint_fixed_testpreds_20211130.joblib')

RuntimeError: CUDA out of memory. Tried to allocate 158.00 MiB (GPU 0; 7.79 GiB total capacity; 6.66 GiB already allocated; 116.06 MiB free; 6.81 GiB reserved in total by PyTorch)

In [37]:
test_preds

Unnamed: 0,saint_f0,saint_f1,saint_f2,saint_f3,saint_f4
0,1.000000,1.000000,1.000000,1.000000,1.000000
1,1.000000,1.000000,1.000000,1.000000,1.000000
2,1.000000,1.000000,1.000000,0.999999,1.000000
3,0.980476,0.227577,0.921866,0.999973,0.966152
4,1.000000,1.000000,1.000000,1.000000,1.000000
...,...,...,...,...,...
539995,1.000000,1.000000,1.000000,1.000000,1.000000
539996,1.000000,1.000000,1.000000,1.000000,1.000000
539997,0.236990,0.126843,0.017522,0.626631,0.000961
539998,1.000000,1.000000,1.000000,1.000000,1.000000


In [39]:
dump(test_preds, predpath/'saint_fixed_testpreds_20211130b.joblib')

['/media/sf/easystore/kaggle_data/tabular_playgrounds/nov2021/preds/saint_fixed_testpreds_20211130b.joblib']

In [32]:
test_preds.shape

(540000,)

In [None]:
dump(test_preds, predpath/'saint_fixed_testpreds_20211130.joblib')


`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.


['/media/sf/easystore/kaggle_data/tabular_playgrounds/nov2021/preds/saint_fixed_testpreds_20211130.joblib']

In [35]:
test_preds.iloc[60160:60288, :]

Unnamed: 0,saint_f0,saint_f1,saint_f2,saint_f3
60160,,,,
60161,,,,
60162,,,,
60163,,,,
60164,,,,
...,...,...,...,...
60283,,,,
60284,,,,
60285,,,,
60286,,,,


# Logistic Regression

In [36]:
from sklearn import model_selection

`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.


In [37]:
kfold = model_selection.StratifiedKFold(n_splits=5, shuffle=False) # no random_state if shuffle == False

In [38]:
oof_preds, oof_y = [], []

In [39]:
test_preds = np.zeros((X_test.shape[0]))

In [40]:
test_preds.shape

(540000,)

In [48]:
X = load(predpath/'oof_gbms.joblib')
X_test = load(predpath/'testpreds_gbms.joblib')

`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.


In [49]:
X_test.corr()

Unnamed: 0,xgboost42,lightgbm42,catboost42
xgboost42,1.0,0.982128,0.997896
lightgbm42,0.982128,1.0,0.983802
catboost42,0.997896,0.983802,1.0


In [51]:
X_test

`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.


Unnamed: 0,xgboost42,lightgbm42,catboost42
0,0.999973,0.999545,0.999845
1,0.997947,0.991481,0.995011
2,1.000000,0.999999,1.000000
3,0.406173,0.652934,0.393805
4,0.988284,0.961683,0.979108
...,...,...,...
539995,0.999953,0.996887,0.999750
539996,0.999740,0.995996,0.999138
539997,0.575609,0.506072,0.541775
539998,0.999912,0.999435,0.999613


In [52]:
X_test['mean'] = (X_test['xgboost42'] + X_test['lightgbm42'] + X_test['catboost42']) / 3

`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.


In [53]:
dump(X_test, datapath/'gbms_test_preds_with_mean.joblib')

['/media/sf/easystore/kaggle_data/tabular_playgrounds/nov2021/datasets/gbms_test_preds_with_mean.joblib']

In [36]:
# X = np.array(train_preds)

`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.


In [42]:
X.shape

(600000, 3)

In [43]:
type(X)

pandas.core.frame.DataFrame

In [44]:
X_test.shape

(540000, 3)

In [45]:
X = np.array(X)
X_test = np.array(X_test)

In [None]:
X_test = 

In [47]:
for fold, (train_idx, valid_idx) in enumerate(kfold.split(X,y)):
    X_train, X_valid = X[train_idx], X[valid_idx]
    y_train, y_valid = y[train_idx], y[valid_idx]
    
    print(f"FOLD {fold}")
    print("---------------------")
    
    model = LogisticRegression(max_iter=1000)
    
    model.fit(X_train, y_train)
    preds = model.predict_proba(X_valid)[:,1]
    
    oof_preds.extend(preds)
    oof_y.extend(y_valid)
    
    test_preds += model.predict_proba(X_test)[:,1]
    
    valid_auc = roc_auc_score(y_valid, preds)
    print(f"ROC AUC of fold {fold} is {valid_auc}")
    
#     dump(preds, /'lv_3)

valid_auc_total = roc_auc_score(oof_y, oof_preds)
print(f"Overall ROC_AUC is {valid_auc_total}")

dump(oof_preds, predpath/f"{wandb_config['name']}nb-{datetime.now().strftime('%Y%m%d%H%M%S')}run-saint_oof_lv2_preds.joblib")
dump(oof_y, predpath/'oof_lv2_y.joblib')

test_preds /= 5
dump(test_preds, predpath/f"{wandb_config['name']}nb-{datetime.now().strftime('%Y%m%d%H%M%S')}run-saint_test_lv2_preds.joblib")

`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.


FOLD 0
---------------------
ROC AUC of fold 0 is 0.4759910308568518
FOLD 1
---------------------
ROC AUC of fold 1 is 0.501158578113442
FOLD 2
---------------------
ROC AUC of fold 2 is 0.49716601957318995
FOLD 3
---------------------
ROC AUC of fold 3 is 0.4908806376156154
FOLD 4
---------------------
ROC AUC of fold 4 is 0.5037770343269935
Overall ROC_AUC is 0.4898580822986335


NameError: name 'wandb_config' is not defined

In [57]:
test_preds_saint.head()

`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.


Unnamed: 0,saint_f0,saint_f1,saint_f2,saint_f3,saint_f4
0,0.997013,0.999735,1.0,0.999988,1.0
1,0.999889,0.999998,0.999999,0.999996,1.0
2,0.999999,1.0,0.999996,0.992245,0.999756
3,0.005143,0.000337,0.044004,0.465028,0.041777
4,1.0,0.999982,1.0,1.0,1.0


In [40]:
test_preds_saint = test_preds

In [41]:
test_preds_saint['mean'] = (test_preds_saint['saint_f0'] + test_preds_saint['saint_f1'] + test_preds_saint['saint_f2'] + test_preds_saint['saint_f3'] + test_preds_saint['saint_f4']) / 5

In [42]:
dump(test_preds_saint, predpath/'saint_fixed_testpreds_20211130_with_mean.joblib')

`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.


['/media/sf/easystore/kaggle_data/tabular_playgrounds/nov2021/preds/saint_fixed_testpreds_20211130_with_mean.joblib']

# SUBMISSION

In [43]:
sample_df = pd.read_csv(datapath/'sample_submission.csv')

In [44]:
sample_df.loc[:, 'target'] = test_preds_saint['mean']

In [45]:
sample_df.head()

Unnamed: 0,id,target
0,600000,1.0
1,600001,1.0
2,600002,1.0
3,600003,0.819209
4,600004,1.0


In [46]:
sample_df.iloc[60160:60288, :]

`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.


Unnamed: 0,id,target
60160,660160,
60161,660161,
60162,660162,
60163,660163,
60164,660164,
...,...,...
60283,660283,
60284,660284,
60285,660285,
60286,660286,


In [1]:
# submission_path = datapath/'submissions'
# submission_path.mkdir(exist_ok=True)

In [49]:
sample_df.to_csv(subpath/f"saint_fixed_mean_20211130.csv", index=False)
# sample_df.to_csv(subpath/f"{wandb_config['name']}_3level-X_orig+KMeans8+synth-GBM-stack_ensemble_preds.csv", index=False)

In [61]:
# dump(sample_df.iloc[60160:60288, 1], predpath/'gbm_fillins_for_60160-60288.joblib')

['/media/sf/easystore/kaggle_data/tabular_playgrounds/nov2021/preds/gbm_fillins_for_60160-60288.joblib']