# Super-Learner Manual Implementation
Since `mlens.ensemble.SuperLearner` does not appear to be capable of correctly outputting probabilities in binary classification tasks, I'm going to roll my own implementation of the underlying algorithm.

# Setup

In [1]:
# two manual flags (ex-config)
colab = False
gpu_available = False
libraries = ['xgboost', 'lightgbm', 'catboost']

In [2]:
# basic imports
from pathlib import Path
import os
import math
from datetime import datetime

In [3]:
%matplotlib inline
%config Completer.use_jedi = False
os.environ['WANDB_NOTEBOOK_NAME'] = f"super-learner_manual_{datetime.now().strftime('%Y%m%d')}.ipynb"

In [4]:
# handle Google Colab-specific library installation/updating
if colab:
    # much of the below inspired by or cribbed from the May 2021 Kaggle Tabular Playground winner, at 
    # https://colab.research.google.com/gist/academicsuspect/0aac7bd6e506f5f70295bfc9a3dc2250/tabular-may-baseline.ipynb?authuser=1#scrollTo=LJoVKJb5wN0L
    
    # Kaggle API for downloading the datasets
    !pip install --upgrade -q kaggle

    # weights and biases
    !pip install -qqqU wandb
    
    # Optuna for parameter search
    !pip install -q optuna

    # upgrade sklearn
    !pip install --upgrade scikit-learn

    !pip install category_encoders
    
    if 'catboost' in libraries:
        !pip install catboost
    
    if 'xgboost' in libraries:
        if gpu_available: 
            # this part is from https://github.com/rapidsai/gputreeshap/issues/24
            !pip install cmake --upgrade
            # !pip install sklearn --upgrade
            !git clone --recursive https://github.com/dmlc/xgboost
            %cd /content/xgboost
            !mkdir build
            %cd build
            !cmake .. -DUSE_CUDA=ON
            !make -j4
            %cd /content/xgboost/python-package
            !python setup.py install --use-cuda --use-nccl
            !/opt/bin/nvidia-smi
            !pip install shap
        else:
            !pip install --upgrade xgboost
    if 'lightgbm' in libraries:
        if gpu_available:
            # lighgbm gpu compatible
            !git clone --recursive https://github.com/Microsoft/LightGBM
            ! cd LightGBM && rm -rf build && mkdir build && cd build && cmake -DUSE_GPU=1 ../../LightGBM && make -j4 && cd ../python-package && python3 setup.py install --precompile --gpu;
        else:
            !pip install --upgrade lightgbm
        

        

Now, non-stdlib imports

In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm

# general ML tooling
from sklearn.model_selection import train_test_split, StratifiedKFold, KFold
from sklearn.metrics import log_loss, roc_auc_score
import wandb
from wandb.xgboost import wandb_callback
from wandb.lightgbm import wandb_callback
from sklearn.impute import SimpleImputer #, KNNImputer
# import timm
import seaborn as sns

from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import StackingClassifier, RandomForestClassifier
# from mlens.ensemble import SuperLearner
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler #, MinMaxScaler, MaxAbsScaler, RobustScaler, PolynomialFeatures
# from sklearn.feature_selection import SelectKBest, f_regression
from joblib import dump, load
# feature engineering tools
# from sklearn.feature_selection import mutual_info_regression
# import featuretools as ft


Now, datapath setup

In [6]:
# # This is the code for reading the train.csv and converting it to a .feather file
# df = pd.read_csv(datapath/'train.csv', index_col='id', low_memory=False)
# df.index.name = None
# df.to_feather(path='./dataset_df.feather')

In [7]:
if colab:
    # mount Google Drive
    from google.colab import drive
    drive.mount('/content/drive')
    
    # handling datapath
    datapath = Path('/content/drive/MyDrive/kaggle/tabular_playgrounds/sep2021/')
    
else:
    # if on local machine
    datapath = Path('/media/sf/easystore/kaggle_data/tabular_playgrounds/sep2021/')    
    


## Ex-Model Config

In [8]:
# meta-config for preprocessing and cross-validation, but NOT for model parameters
exmodel_config = {
    # model config
#     "model": XGBClassifier,
#     "n_estimators": 100, 
#     "max_depth": 3,
#     "learning_rate": 0.1,
#     "test_size": 0.2,
#     "reg_lambda": None, 
    "library": libraries,
    "scaler": StandardScaler, # TODO: experiment with others (but imputation may be slow)
    "scale_b4_impute": False,
    "imputer": SimpleImputer(strategy='median', add_indicator=True),
#     "knn_imputer_n_neighbors": None, # None if a different imputer is used
#     "feature_selector": SelectKBest,
#     "k_best": 80,
#     "feature_selection_scoring": f_regression,
    'random_state': 42,
#     'subsample': 1,
#     'cross_val_strategy': KFold, # None for holdout, or the relevant sklearn class
#     'kfolds': 5, # if 1, that means just doing holdout
#     'test_size': 0.2,
#     'features_created': False,
#     'feature_creator': None,
}

## Data Setup

**TODO** Write some conditional logic here to automate it -- possibly as part of a sklearn.*pipeline

In [9]:
# if exmodel_config['scaler']:
#     scaler = exmodel_config['scaler']()
#     scaler.fit_transform()

In [10]:
# # here's how to load the original, unaltered dataset and separate features from targets
# df = pd.read_feather(path=datapath/'dataset_df.feather') # this is the unaltered original dataset
# features = [x for x in df.columns if x != 'claim']
# X = df[features]
# y = df.claim



# load the version of the dataset with imputations; X and y were stored separately, as feather and joblib respectively
X = pd.read_feather(datapath/'X_NaNcounts_imputed-Median-wIndicators-StandardScaled.feather') 
y = load(datapath/'y.joblib')    
X.index.name = 'id'
y.index.name = 'id'

exmodel_config['feature_count'] = len(X.columns)
exmodel_config['feature_generator'] = None

In [11]:
# scaler = exmodel_config['scaler']()
# X_scaled = scaler.fit_transform(X)
# X = pd.DataFrame(X_scaled, columns=X.columns)

In [12]:
X.head()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,227,228,229,230,231,232,233,234,235,236
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.425545,-2.357891,-0.637206,-0.866657,-0.111568,-4.829243,-1.171229,-0.603397,-0.596871,-0.516828,...,-0.128368,-0.127677,-0.128242,-0.127867,-0.127119,-0.127985,-0.128494,-0.12862,-0.127854,-0.12703
1,0.2476,-0.323982,1.223569,0.361863,1.071182,-0.36114,0.082051,-0.74659,0.899454,0.469668,...,-0.128368,-0.127677,-0.128242,-0.127867,-0.127119,-0.127985,-0.128494,-0.12862,-0.127854,-0.12703
2,2.032371,-2.43568,-0.48896,0.341193,1.069656,0.118532,0.537069,-0.044075,-0.763516,1.056879,...,-0.128368,-0.127677,-0.128242,-0.127867,-0.127119,-0.127985,-0.128494,-0.12862,-0.127854,-0.12703
3,1.438373,-2.337605,-0.508914,-0.829607,1.485682,3.592008,-1.189087,-0.339152,-0.735281,-0.529158,...,-0.128368,-0.127677,-0.128242,-0.127867,-0.127119,-0.127985,-0.128494,-0.12862,-0.127854,-0.12703
4,0.602333,1.076218,-0.648438,0.463365,0.275053,-0.157989,0.727338,-0.905498,0.052478,-0.511066,...,-0.128368,-0.127677,-0.128242,-0.127867,-0.127119,-0.127985,-0.128494,-0.12862,7.821398,-0.12703


In [13]:
y.head()

id
0    1
1    0
2    1
3    1
4    1
Name: claim, dtype: int64

### Model Config

In [14]:
def model_configurator(library, gpu_available=True):#, config=universal_config):
    """
    Function that provide task-specific or general preference arguments for the various models. 
    
    At first, will rely largely on defaults for hyperparameters, but later this function 
    can be supplemented later with optimal values, as they're learned in sweeps.
    .
    
    Rationale: creating a helper function will allow more experimentation later, and also
    composite runs that cycle through a series of models.
    
    :param model: A model from [XGBClassifier, LGBMClassifier, CatBoostClassifier]
    :return config: A dict that supplements default hyperparameter values with 1) 
                    task-appropriate ones, and perhaps later 2) optimal hyperparameter values.
    """
    config = {}
    
    # library-specific config
    if library in ['xgboost', 'lightgbm']:
        config['n_jobs'] = -1
        
    # best params per sweep `icac24c5`, generated from notebook `sweep_20210905.ipynb`
    # runtime per fold should be around 12m 38s
    # should get auc of 0.7434 on the random_state=42 holdout
    # haven't yet tried dart
    if library == 'xgboost':
#         config['tree_method'] = 'auto'
#         config['booster'] = 'gbtree' # or 'dart'
#         config['model'] = XGBClassifier
        config['verbosity'] = 1
        config['objective'] = 'binary:logistic'
#         config['eval_metric'] = ['auc', 'logloss', 'aucpr'],
        config['tree_method'] = 'gpu_hist' if (gpu_available and colab) else 'auto' 
        
        # comment out the below to get defaults
        config['n_estimators'] = 902
        config['learning_rate'] = 0.0304
        config['max_depth'] = 3
        config['reg_alpha'] = 0.863
        config['reg_lambda'] = 2.442
        config['subsample'] = 0.8627

    # best params per sweep `sjghewf0`, generated from notebook `sweep_lightgbm_20210907`
    # run name `sweep_lightgbm_20210907_195641`
    # runtime per fold should be around 39s
    # should get an auc of 0.7435 on random_state=42 holdout
    if library == 'lightgbm':
#         config['model'] = LGBMClassifier
        config['objective'] = 'binary'
        config['eval_metric'] = ['auc', 'logloss']
        config['boosting_type'] = 'gbdt' # or 'dart'
        config['device_type'] = 'cuda' if (gpu_available and colab) else 'cpu' # 'gpu' also possible, 'cpu' is default
        
        # comment out the below for defaults
        config['n_estimators'] = 1286
        config['learning_rate'] = 0.03221
        config['max_depth'] = 2
        config['reg_alpha'] = 0.4687
        config['reg_lambda'] = 0.1763
        config['subsample'] = 0.6621
        

#     if config['model'] == CatBoostClassifier:
    if library == 'catboost':
#         config['model'] = CatBoostClassifier
        config['task_type'] = 'GPU' if gpu_available else 'CPU'
        config['custom_metrics'] = ['Logloss', 'AUC'] # objective (loss fn) must be singular, defaults to Logloss
        config['n_estimators'] = 2000 # logged as "iterations" otherwise

    return config

## Weights and Biases Run Config

Below is the configuration for a Weights and Biases (`wandb`) run.

In [15]:
# wandb config:
config_run = {
    'name': f"{os.environ['WANDB_NOTEBOOK_NAME'][:-6]}_{datetime.now().strftime('%H%M%S')}", # just removes the .ipynb extension, leaving the notebook filename's stem
    'tags': ['super-learner', 'experiment', 'attempt'],
    'notes': "Since `mlens.ensemble.SuperLearner` does not appear to be capable of correctly outputting probabilities in binary classification tasks, I'm going to roll my own implementation of the underlying algorithm.",
}

# Preprocessing
Scaling has already occurred -- used `StandardScaler` as a precursor to using `KNNImputer(n_neighbors=5)`, on the premise that imputation would proceed more quickly if things were already scaled. I may try different permutations of this later: using `IterativeImputer` instead, before or after scaling, potentially with different scalers. 

# Feature Creation and Selection

In [16]:
# load all the polynomialfeatures generated with `PolynomialFeatures(degree=2, include_bias=True, interaction_only=False)`
# X_np = np.load(datapath/'X_poly_unscaled.npy')
# X = pd.DataFrame(X_np)

In [17]:
# X.columns

In [18]:
# generation polynomial features
# poly = PolynomialFeatures(degree=2, include_bias=True, interaction_only=False)
# X_poly = poly.fit_transform(X)

In [19]:
# X_poly_names = poly.get_feature_names(X.columns)
# # X_poly_names[100:150]
# features = pd.read_csv('X_candidates_20210827.csv').columns

In [20]:
# checks = [feature in X_poly_names for feature in features]
# checks

In [21]:
# X = pd.DataFrame(X_poly, columns=X_poly_names)

In [22]:
# X = X[features[1:]]

# Training

In [23]:
def super_train(X_train, X_valid, y_train, y_valid, model_config, 
                                              random_state=42,
                                              exmodel_config=exmodel_config, 
                                              config_run=config_run):#, scaler): # passed in via config dict for now
    """
    Basic training function. Note that some of the options passed via the argument are
    in fact hard-coded in, to avoid inconveniences.
    :param X_train: the training set features
    :param X_valid: the validation set features
    :param y_train: the training set targets
    :param y_valid: the validation set targets
    :param random_staKFold: for reproducibility
    :param exmodel_config: dict containing configuration details including the library 
                            (thus model) used, preprocessing, and cross-validation
    :param model_config: dict containing hyperparameter specifications for the model
    :param config_run: dict containing wandb run configuration (name, etc)
    """
    
    
    wandb.init(
        project="202109_Kaggle_tabular_playground",
        save_code=True,
        tags=config_run['tags'],
        name=config_run['name'],
        notes=config_run['notes'],
        config=exmodel_config)   
        
    if exmodel_config['library'] == 'xgboost':
        model = XGBClassifier(
            tree_method=model_config['tree_method'],
            random_state=random_state,
            n_jobs=model_config['n_jobs'], 
            verbosity=model_config['verbosity'], 
            objective=model_config['objective'],
            # #             eval_metric=model_config['eval_metric'],

            # comment out the below for a fairly default model
#             booster=model_config['booster'],
            max_depth=model_config['max_depth'],
            learning_rate=model_config['learning_rate'], 
            subsample=model_config['subsample'],
            reg_alpha=model_config['reg_alpha'],
            reg_lambda=model_config['reg_lambda'],
            n_estimators=model_config['n_estimators'],
        )
        
        model.fit(X_train, y_train, callbacks=[wandb.xgboost.wandb_callback()],
#                                     eval_metric=model_config['eval_metric'],
                 )


    elif exmodel_config['library'] == 'lightgbm':
        model = LGBMClassifier(
#             boosting_type=model_config['boosting_type'],
#             max_depth=model_config['max_depth']
            # TODO
            random_state=random_state,
            n_jobs=model_config['n_jobs'],
            objective=model_config['objective'],
#             eval_metric=model_config['eval_metric'],
            boosting_type=model_config['boosting_type'],
            device_type=model_config['device_type'],
            
            # comment out the below for a basically default model
            n_estimators=model_config['n_estimators'],
            learning_rate=model_config['learning_rate'],
            max_depth=model_config['max_depth'],
            reg_alpha=model_config['reg_alpha'],
            reg_lambda=model_config['reg_lambda'],
            subsample=model_config['subsample'],
        )
        
        model.fit(X_train, y_train, callbacks=[wandb.lightgbm.wandb_callback()],
#                                     eval_metric=model_config['eval_metric'],
                 )
        
    elif exmodel_config['library'] == 'catboost':
        print("CatBoost, therefore no WandB callback.")
        model = CatBoostClassifier(
#             n_estimators=config['n_estimators'],
#             learning_rate=config['learning_rate'],
#             max_depth=config['max_depth'],
            task_type=model_config['task_type'],
    #         n_jobs=config['n_jobs'],
    #         verbosity=config['verbosity'],
    #         subsample=config['subsample'],
            n_estimators=model_config['n_estimators'],
            random_state=random_state,
            # objective='Logloss', # default, accepts only one
#             custom_metrics=model_config['custom_metrics'],
    #         bootstrap_type=config['bootstrap_type'],
    #         device:config['device']
        ) 
        model.fit(X_train, y_train)
        
#     y_train_pred = model.predict(X_train)
    y_train_pred = model.predict_proba(X_train)[:,1]

    train_loss = log_loss(y_train, y_train_pred)
    train_auc = roc_auc_score(y_train, y_train_pred)
    wandb.log({'train_loss': train_loss, 'train_auc': train_auc})

    if exmodel_config['library'] == 'catboost':
        print(model.get_all_params())
        wandb.log(model.get_all_params())
    else:
        wandb.log(model.get_params()) # logging model parameters, trying bare-invocation rather than params: model.get_params()
    
    # trying with predict_proba
    y_pred = model.predict_proba(X_valid)[:,1]
#     y_pred = model.predict(X_valid)

    valid_loss = log_loss(y_valid, y_pred)
    valid_auc = roc_auc_score(y_valid, y_pred)
    wandb.log({'valid_loss':valid_loss, 'valid_auc':valid_auc})
    print(f"Valid log-loss is {valid_loss}\nValid AUC is {valid_auc}")   
#     wandb.finish()   
    return model, y_pred
    

In [24]:
random_state = 42
model_list = []
#     XGBClassifer(
#         verbosity=1,
#         n_jobs=-1,
#         objective='binary:logistic',
#         tree_method= 'gpu_hist' if (gpu_available and colab) else 'auto',
#         n_estimators=902,
#         learning_rate=0.0304,
#         max_depth=3,
#         reg_alpha = 0.863,
#         reg_lambda = 2.442,
#         subsample= 0.8627,
#         random_state=exmodel_config['random_state'],
#     ),
model_config = model_configurator('xgboost')
model_list.append(
    ('xgboost', XGBClassifier(
        tree_method=model_config['tree_method'],
        random_state=random_state,
        n_jobs=model_config['n_jobs'], 
        verbosity=model_config['verbosity'], 
        objective=model_config['objective'],
        # #             eval_metric=model_config['eval_metric'],

        # comment out the below for a fairly default model
#             booster=model_config['booster'],
        max_depth=model_config['max_depth'],
        learning_rate=model_config['learning_rate'], 
        subsample=model_config['subsample'],
        reg_alpha=model_config['reg_alpha'],
        reg_lambda=model_config['reg_lambda'],
        n_estimators=model_config['n_estimators'],
    ))
)

model_config = model_configurator('lightgbm')
model_list.append(
    ('lightgbm', LGBMClassifier(
#         verbose=model_config['verbosity'], 
        random_state=random_state,
        n_jobs=model_config['n_jobs'],
        objective=model_config['objective'],
#             eval_metric=model_config['eval_metric'],
        boosting_type=model_config['boosting_type'],
        device_type=model_config['device_type'],

        # comment out the below for a basically default model
        n_estimators=model_config['n_estimators'],
        learning_rate=model_config['learning_rate'],
        max_depth=model_config['max_depth'],
        reg_alpha=model_config['reg_alpha'],
        reg_lambda=model_config['reg_lambda'],
        subsample=model_config['subsample'],
    ))
)

model_config = model_configurator('catboost', gpu_available=False)
model_list.append(
    ('catboost', CatBoostClassifier(
#             n_estimators=config['n_estimators'],
#             learning_rate=config['learning_rate'],
#             max_depth=config['max_depth'],
        task_type=model_config['task_type'],
#         n_jobs=config['n_jobs'],
#         verbosity=config['verbosity'],
#         subsample=config['subsample'],
        n_estimators=model_config['n_estimators'],
        random_state=random_state,
                # objective='Logloss', # default, accepts only one
    #             custom_metrics=model_config['custom_metrics'],
        #         bootstrap_type=config['bootstrap_type'],
        #         device:config['device']
            ))
)

In [25]:
# TRYING TO REVISE THIS BELOW

# def super_cross_validation(model_list=model_list, X=X, y=y, start_fold=0, exmodel_config=exmodel_config, random_state=42):
#     """
#     Function to handle model training process in the context of cross-validation -- via hold-out or via k-fold.
#     If exmodel_config['cross_val_strategy'] == None, then any kfolds= input is ignored; otherwise, the number specified is used.
    
#     :param kfolds: int specifying number of k-folds to use in cross-validation
#     :param exmodel_config: dict containing general config including for cross-validation -- `kfold=1` implies hold-out
#     """
# #     if exmodel_config['kfolds'] == 1:
# #         print("Proceeding with holdout")
# #         X_train, X_valid, y_train, y_valid = train_test_split(X, y, 
# #                                                       test_size=exmodel_config['test_size'], 
# #                                                       random_state=random_state,
# #                                                      )
# #         model = train(X_train, X_valid, y_train, y_valid, exmodel_config=exmodel_config, 
# #                                                     model_config=model_config,
# #                                                     config_run=config_run)
# #         wandb.finish()
        
# #     else:

#     wandb.init(
#         project="202109_Kaggle_tabular_playground",
#         save_code=True,
#         tags=config_run['tags'],
#         name=config_run['name'],
#         notes=config_run['notes'],
#         config=exmodel_config)  
    
#     X, y = X.to_numpy(), y.to_numpy()
#     # from https://machinelearningmastery.com/super-learner-ensemble-in-python/ # Super Learner for Classification
#     X_meta, y_meta = [], [] # for the meta-learner; might concat the X with the data later for a passthrough=True run
#     exmodel_config['kfolds'] = 5
#     kfold = KFold(n_splits=exmodel_config['kfolds'], shuffle=True, random_state=random_state)
#     trained_models = {}
# #         oof_preds = {}
#     model_path = Path(datapath/f"models/{config_run['name']}_{exmodel_config['kfolds']}folds/")
#     (model_path).mkdir(exist_ok=True)
#     preds_path = Path(datapath/f"preds/{config_run['name']}_{exmodel_config['kfolds']}folds/")
#     (preds_path).mkdir(exist_ok=True)
#     for fold, (train_ids, valid_ids) in enumerate(kfold.split(X,y)):
#         if fold < start_fold:
#             continue
#         else:
#             print(f"FOLD {fold}")
#             print("---------------------------------------------------")
#             fold_preds = []
#             X_train, X_valid = X[train_ids], X[valid_ids]
#             y_train, y_valid = y[train_ids], y[valid_ids]
#             y_meta.extend(y_valid) # building this "meta" y as we go - check RAM can manage
#             for model in model_list:
#                 model[1].fit(X_train, y_train)
#                 y_train_pred = model[1].predict_proba(X_train)[:,1]

#                 train_loss = log_loss(y_train, y_train_pred)
#                 train_auc = roc_auc_score(y_train, y_train_pred)
# #                     wandb.log({'train_loss': train_loss, 'train_auc': train_auc})

#                 if model[0] == 'catboost':
# #                         print(model[1].get_all_params())
#                     wandb.log({'catboost_params': str(model[1].get_all_params())})                        
#                 else:
#                     wandb.log({f'{model[0]}_params': str(model[1].get_params())}) 
#                 y_pred = model[1].predict_proba(X_valid)[:,1]
#                 fold_preds.append(y_pred)

#                 valid_loss = log_loss(y_valid, y_pred)
#                 valid_auc = roc_auc_score(y_valid, y_pred)
#                 wandb.log({'valid_loss':valid_loss, 'valid_auc':valid_auc})
#                 print(f"Valid log-loss is {valid_loss}\nValid AUC is {valid_auc}")   
# #                     oof_preds = super_train(X_train, X_valid, y_train, y_valid, exmodel_config=exmodel_config, 
# # #                                                     model_config=model_config,
# #                                                     config_run=config_run)
#                 wandb.log({'fold': fold})
#                 dump(y_pred, Path(preds_path/f"{model[0]}_fold{fold}_oof-preds.joblib"))

# #                     models[fold] = model
# #                 preds[fold] = oof_preds
#             X_meta.append(np.hstack(fold_preds))
#             dump(fold_preds, Path(preds_path/f"all_fold{fold}_oof-preds.joblib"))
# #                     dump(model, Path(model_path/f"{exmodel_config['library']}_fold{fold}_model.joblib"))

# #                 wandb.finish()
# #     return np.vstack(X_meta), np.asarray(y_meta)
#     return X_meta, y_meta
        

In [26]:
# X.shape

In [27]:
# kfold = KFold(n_splits=exmodel_config['kfolds'], shuffle=True, random_state=42)
# X_np, y_np = {}, {}
# X_trains, y_trains = {}, {}
# X_valids, y_valids = {}, {}
# for fold, (train_ids, valid_ids) in enumerate(kfold.split(X,y)):
#     print(type(train_ids))
#     print(train_ids[:20])
#     print(X.loc[train_ids])
#     print(fold)
#     X_np[fold] = X.to_numpy()
#     y_np[fold] = y.to_numpy()
#     X_trains[fold] = X[train_ids]
#     X_valids[fold] = X[valid_ids]
#     y_trains[fold], y_valids[fold] = y[train_ids], y[valid_ids]
# #     X_trains[fold] = X_np[fold][train_ids]
# #     X_valids[fold] = X_np[fold][valid_ids]
# #     y_trains[fold], y_valids[fold] = y_np[fold][train_ids], y_np[fold][valid_ids]
#     print(f"X_train shape is {X_train.shape}, X_valid shape is {X_valid.shape}")
#     print(f"y_train.shape is {y_train.shape}, y_valid shape is {y_valid.shape}")

In [28]:
# splitted = kfold.split(X,y)

In [29]:
# for fold, (train_ids, valid_ids) in enumerate(splitted):
#     print(f"Fold {fold} has {len(train_ids)} training items and {len(valid_ids)} valid items")

Oddly, it seems that the shapes are different if you you're explicitly 

In [30]:
# X_meta, y_meta = super_cross_validation()
# X_meta_df = pd.DataFrame(X_meta, columns=[str(x) for x in range(X_meta.shape[1])])
# X_meta_df.to_feather((datapath/'X_super-learner_meta.feather'))
# dump(y_meta, datapath/"y_super-learner_meta.joblib")

In [31]:
# test = pd.DataFrame({'col1': [1,2,3,4], 'col2': [11,12,13,14]})

In [32]:
# test1 = pd.DataFrame({'col1': [5,6,7,8], 'col2': [15,16,17,18]})

In [33]:
# test_concat = pd.concat([test, test1], axis=0)

In [34]:
# test_concat

In [35]:
test = pd.DataFrame()

In [36]:
type(test)

pandas.core.frame.DataFrame

In [37]:
test_list = model_list[1:]

In [38]:
test_list

[('lightgbm',
  LGBMClassifier(device_type='cpu', learning_rate=0.03221, max_depth=2,
                 n_estimators=1286, objective='binary', random_state=42,
                 reg_alpha=0.4687, reg_lambda=0.1763, subsample=0.6621)),
 ('catboost', <catboost.core.CatBoostClassifier at 0x7ff3d04dec70>)]

In [39]:
def super_cross_validation(model_list=model_list, 
                           X=X, 
                           y=y, 
                           start_fold=0, 
                           exmodel_config=exmodel_config, 
                           random_state=42, 
                           wandb_run=True, 
                           passthrough=False):
    """
    Function to handle model training process in the context of cross-validation -- via hold-out or via k-fold.
    If exmodel_config['cross_val_strategy'] == None, then any kfolds= input is ignored; otherwise, the number specified is used.
    
    :param kfolds: int specifying number of k-folds to use in cross-validation
    :param exmodel_config: dict containing general config including for cross-validation -- `kfold=1` implies hold-out
    """
#     if exmodel_config['kfolds'] == 1:
#         print("Proceeding with holdout")
#         X_train, X_valid, y_train, y_valid = train_test_split(X, y, 
#                                                       test_size=exmodel_config['test_size'], 
#                                                       random_state=random_state,
#                                                      )
#         model = train(X_train, X_valid, y_train, y_valid, exmodel_config=exmodel_config, 
#                                                     model_config=model_config,
#                                                     config_run=config_run)
#         wandb.finish()
        
#     else:

    if wandb_run:
        wandb.init(
            project="202109_Kaggle_tabular_playground",
            save_code=True,
            tags=config_run['tags'],
            name=config_run['name'],
            notes=config_run['notes'],
            config=exmodel_config)  
    
    X_np, y_np = X.to_numpy(), y.to_numpy()
    # from https://machinelearningmastery.com/super-learner-ensemble-in-python/ # Super Learner for Classification
#     X_meta, y_meta = X, y # for the meta-learner; might concat the X with the data later for a passthrough=True run
#     X_baseline_preds = 
    X_meta, y_meta = [], []
    exmodel_config['kfolds'] = 5
    exmodel_config['cross_val_strategy'] = 'KFold'
    kfold = KFold(n_splits=exmodel_config['kfolds'], shuffle=True, random_state=random_state)
#     trained_models = {}
#         oof_preds = {}
#     model_path = Path(datapath/f"models/{config_run['name']}_{exmodel_config['kfolds']}folds/")
#     (model_path).mkdir(exist_ok=True)
    preds_path = Path(datapath/f"preds/{config_run['name']}_{exmodel_config['kfolds']}folds/")
    (preds_path).mkdir(exist_ok=True)
    for fold, (train_ids, valid_ids) in enumerate(kfold.split(X,y)):
        if fold < start_fold:
            continue
        else:
            fold_preds = [] # initializing the list of predictions for the fold
            print(f"FOLD {fold}")
            print("---------------------------------------------------")
            X_np, y_np = X.to_numpy(), y.to_numpy()
            
#             fold_preds = []
            # proeeding with `pd.DataFrame`s rather than `np.ndarray`s
#             X_train, X_valid = X.loc[train_ids], X.loc[valid_ids]
#             y_train, y_valid = y.loc[train_ids], y.loc[valid_ids]
            X_train, X_valid = X_np[train_ids], X_np[valid_ids]
            y_train, y_valid = y_np[train_ids], y_np[valid_ids]
            print(f"type(X_train) = {type(X_train)}")
            print(f"type(y_trian) = {type(y_train)}")
            # append the y_valid to y_meta
#             pd.concat(y_meta, y_valid, axis=0) # building this "meta" y as we go - the valid for each fold go at the bottom of the col/pd.Series
            y_meta.append(y_valid) # assumes y_meta is a list
#             y_meta = pd.concat(y_meta, pd.Series(y_valid))
            for model in model_list: # presumes tuples of the form (str, model)
                model[1].fit(X_train, y_train)
                y_train_pred = model[1].predict_proba(X_train)[:,1]

                train_loss = log_loss(y_train, y_train_pred)
                train_auc = roc_auc_score(y_train, y_train_pred)
                if wandb_run:
                    wandb.log({'train_loss': train_loss, 'train_auc': train_auc})
                    if model[0] == 'catboost':
    #                         print(model[1].get_all_params())
                        wandb.log({'catboost_params': str(model[1].get_all_params())})                        
                    else:
                        wandb.log({f'{model[0]}_params': str(model[1].get_params())}) 
                y_pred = model[1].predict_proba(X_valid)[:,1]
                fold_preds.append(y_pred.reshape(len(y_pred), 1))
                
                valid_loss = log_loss(y_valid, y_pred)
                valid_auc = roc_auc_score(y_valid, y_pred)
                
                print(f"Valid log-loss is {valid_loss}\nValid AUC is {valid_auc}")   
#                     oof_preds = super_train(X_train, X_valid, y_train, y_valid, exmodel_config=exmodel_config, 
# #                                                     model_config=model_config,
#                                                     config_run=config_run)
                
                if wandb_run:
                    wandb.log({'valid_loss':valid_loss, 'valid_auc':valid_auc})
                    wandb.log({'fold': fold})
                dump(y_pred, Path(preds_path/f"{model[0]}_fold{fold}_oof-preds.joblib"))
                
#                     models[fold] = model
#           y      preds[fold] = oof_preds
            X_meta.append(np.hstack(fold_preds))
            dump(fold_preds, Path(preds_path/f"all_fold{fold}_oof-preds.joblib"))
#                     dump(model, Path(model_path/f"{exmodel_config['library']}_fold{fold}_model.joblib"))

#                 wandb.finish()
#     return np.vstack(X_meta), np.asarray(y_meta)
    # at this point, X_meta is a length-5 list of np.ndarrays; each array has as many columns as estimators. The number of rows across folds sum to the number in the dataset.
    # meanwhile, y_meta is a length-5 list of np.ndarrays containing the labels for each fold.
    # on the way out, let's get them into the proper format
#     return X_meta, y_meta 
    return np.vstack(X_meta), np.asarray(y_meta)
        

In [40]:
# this will define the features and labels for the meta-learner
X_meta, y_meta = super_cross_validation(test_list, wandb_run=False)

FOLD 0
---------------------------------------------------
type(X_train) = <class 'numpy.ndarray'>
type(y_trian) = <class 'numpy.ndarray'>
Valid log-loss is 0.5113911181729013
Valid AUC is 0.8114826365905788
Learning rate set to 0.093011
0:	learn: 0.6580718	total: 140ms	remaining: 4m 39s
1:	learn: 0.6298703	total: 228ms	remaining: 3m 47s
2:	learn: 0.6072111	total: 320ms	remaining: 3m 33s
3:	learn: 0.5889982	total: 410ms	remaining: 3m 24s
4:	learn: 0.5743639	total: 510ms	remaining: 3m 23s
5:	learn: 0.5625637	total: 595ms	remaining: 3m 17s
6:	learn: 0.5530566	total: 678ms	remaining: 3m 13s
7:	learn: 0.5454104	total: 773ms	remaining: 3m 12s
8:	learn: 0.5392238	total: 859ms	remaining: 3m 9s
9:	learn: 0.5342211	total: 951ms	remaining: 3m 9s
10:	learn: 0.5301457	total: 1.04s	remaining: 3m 8s
11:	learn: 0.5268757	total: 1.13s	remaining: 3m 7s
12:	learn: 0.5242039	total: 1.21s	remaining: 3m 4s
13:	learn: 0.5220363	total: 1.3s	remaining: 3m 5s
14:	learn: 0.5202507	total: 1.39s	remaining: 3m 3s


In [44]:
X_meta_stacked = np.vstack(X_meta)

In [54]:
y_meta_array = np.asarray(y_meta)

  return array(a, dtype, copy=False, order=order)


In [70]:
y_meta_hstack = np.hstack(y_meta_array,)

In [71]:
y_meta_hstack.shape

(957919,)

In [73]:
y_meta

[array([1, 1, 1, ..., 0, 0, 1]),
 array([1, 0, 0, ..., 0, 1, 1]),
 array([0, 1, 1, ..., 0, 0, 0]),
 array([0, 1, 1, ..., 0, 1, 1]),
 array([1, 0, 0, ..., 0, 1, 1])]

In [72]:
y_meta_hstack

array([1, 1, 1, ..., 0, 1, 1])

In [55]:
meta_learner = LogisticRegression(n_jobs=-1)


In [75]:
np.isnan(y_meta_hstack).any()

False

In [74]:
type(y_meta_hstack)

numpy.ndarray

In [61]:
y_meta_array.shape

(5,)

In [62]:
X_meta_stacked.shape

(957919, 2)

In [76]:
meta_learner.fit(X_meta_stacked, y_meta_hstack)

LogisticRegression(n_jobs=-1)

In [77]:
X_preds = meta_learner.predict_proba(X_meta_stacked)

In [78]:
X_preds # prob of 0, prob of 1

array([[0.45478618, 0.54521382],
       [0.31942728, 0.68057272],
       [0.23111547, 0.76888453],
       ...,
       [0.87283967, 0.12716033],
       [0.22200131, 0.77799869],
       [0.17922862, 0.82077138]])

In [50]:
# for fold, content in enumerate(X_meta):
#     print(f"len(content) = {len(content)}")
#     print(f"For fold {fold}, first ten: {content[:10]}")
#     print(f"Last ten: {content[-10:]}")

len(content) = 574752
For fold 0, first ten: [0.58256644 0.62629855 0.7662285  0.75499147 0.10017349 0.09074276
 0.81610793 0.73974901 0.72552508 0.11378017]
Last ten: [0.71544773 0.12926759 0.77998132 0.73316186 0.5522913  0.14362175
 0.13898191 0.12458763 0.10432058 0.59388915]
len(content) = 574752
For fold 1, first ten: [0.7899453  0.76898772 0.1540149  0.1326734  0.13505521 0.135717
 0.76631808 0.15017609 0.77063125 0.62637734]
Last ten: [0.79133012 0.75843728 0.75821059 0.75214886 0.77777462 0.14167253
 0.12332096 0.11199911 0.77826162 0.59577923]
len(content) = 574752
For fold 2, first ten: [0.14557232 0.13051899 0.731341   0.75136185 0.76965833 0.5596422
 0.15574594 0.58632851 0.73949695 0.1217273 ]
Last ten: [0.12980072 0.13182534 0.7620172  0.47812301 0.61756362 0.13219188
 0.75631119 0.12209702 0.15794384 0.78886702]
len(content) = 574752
For fold 3, first ten: [0.15521842 0.75803405 0.78014207 0.77034795 0.14344598 0.11674909
 0.71550232 0.56999534 0.71767098 0.57802474]
La

In [28]:
X_augment, y_augment = super_cross_validation() # default args; should provide first layer
X_augment_df = pd.DataFrame(X_augment, columns=[str(x) for x in range(X_augment.shape[1])])
X_augment_df.to_feather(datapath/'X_super-learner_augment.feather')
dump(y_augment, datapath/'y_super-learner_augment.joblib')

VBox(children=(Label(value=' 1.72MB of 1.72MB uploaded (0.84MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
xgboost_params,{'objective': 'binar...
_runtime,476
_timestamp,1631498856
_step,2
valid_loss,0.51094
valid_auc,0.81226
fold,0


0,1
_runtime,▁▁▁
_timestamp,▁▁▁
_step,▁▅█
valid_loss,▁
valid_auc,▁
fold,▁


FOLD 0
---------------------------------------------------




Valid log-loss is 0.5109356389636417
Valid AUC is 0.8122584973566012
Valid log-loss is 0.5113911181729013
Valid AUC is 0.8114826365905788
Learning rate set to 0.012572
0:	learn: 0.6881615	total: 12.7ms	remaining: 25.4s
1:	learn: 0.6833118	total: 24.2ms	remaining: 24.2s
2:	learn: 0.6785923	total: 35.9ms	remaining: 23.9s
3:	learn: 0.6740079	total: 47.4ms	remaining: 23.6s
4:	learn: 0.6695553	total: 58.2ms	remaining: 23.2s
5:	learn: 0.6652220	total: 69.9ms	remaining: 23.2s
6:	learn: 0.6610044	total: 81.5ms	remaining: 23.2s
7:	learn: 0.6569123	total: 92.9ms	remaining: 23.1s
8:	learn: 0.6529273	total: 105ms	remaining: 23.3s
9:	learn: 0.6490559	total: 117ms	remaining: 23.2s
10:	learn: 0.6452946	total: 129ms	remaining: 23.3s
11:	learn: 0.6416326	total: 140ms	remaining: 23.2s
12:	learn: 0.6380744	total: 152ms	remaining: 23.3s
13:	learn: 0.6346183	total: 164ms	remaining: 23.2s
14:	learn: 0.6312602	total: 175ms	remaining: 23.2s
15:	learn: 0.6279932	total: 188ms	remaining: 23.3s
16:	learn: 0.62481



Valid log-loss is 0.5084587895255492
Valid AUC is 0.8140504514066044
Valid log-loss is 0.5089415144259191
Valid AUC is 0.8132048676403175
Learning rate set to 0.012572
0:	learn: 0.6881812	total: 11.9ms	remaining: 23.7s
1:	learn: 0.6833507	total: 23.6ms	remaining: 23.6s
2:	learn: 0.6786519	total: 35.8ms	remaining: 23.8s
3:	learn: 0.6740867	total: 47.8ms	remaining: 23.8s
4:	learn: 0.6696530	total: 58.4ms	remaining: 23.3s
5:	learn: 0.6653402	total: 70ms	remaining: 23.2s
6:	learn: 0.6611427	total: 81.7ms	remaining: 23.3s
7:	learn: 0.6570650	total: 92.8ms	remaining: 23.1s
8:	learn: 0.6530955	total: 105ms	remaining: 23.2s
9:	learn: 0.6492365	total: 116ms	remaining: 23.1s
10:	learn: 0.6454903	total: 128ms	remaining: 23.2s
11:	learn: 0.6418443	total: 139ms	remaining: 23s
12:	learn: 0.6382974	total: 150ms	remaining: 23s
13:	learn: 0.6348558	total: 160ms	remaining: 22.8s
14:	learn: 0.6315114	total: 171ms	remaining: 22.6s
15:	learn: 0.6282562	total: 182ms	remaining: 22.6s
16:	learn: 0.6250947	tot



Valid log-loss is 0.5097983286020547
Valid AUC is 0.8128374867961028
Valid log-loss is 0.5102146583888731
Valid AUC is 0.812064302187439
Learning rate set to 0.012572
0:	learn: 0.6881698	total: 11.9ms	remaining: 23.8s
1:	learn: 0.6833288	total: 23.6ms	remaining: 23.6s
2:	learn: 0.6786181	total: 35.8ms	remaining: 23.8s
3:	learn: 0.6740427	total: 48.8ms	remaining: 24.4s
4:	learn: 0.6695985	total: 59.8ms	remaining: 23.8s
5:	learn: 0.6652731	total: 71.3ms	remaining: 23.7s
6:	learn: 0.6610628	total: 83.1ms	remaining: 23.6s
7:	learn: 0.6569759	total: 94.5ms	remaining: 23.5s
8:	learn: 0.6529996	total: 106ms	remaining: 23.4s
9:	learn: 0.6491372	total: 117ms	remaining: 23.3s
10:	learn: 0.6453828	total: 128ms	remaining: 23.1s
11:	learn: 0.6417263	total: 138ms	remaining: 22.9s
12:	learn: 0.6381747	total: 150ms	remaining: 23s
13:	learn: 0.6347257	total: 162ms	remaining: 22.9s
14:	learn: 0.6313739	total: 172ms	remaining: 22.8s
15:	learn: 0.6281130	total: 185ms	remaining: 22.9s
16:	learn: 0.6249443	



Valid log-loss is 0.508592728157079
Valid AUC is 0.8134012906552728
Valid log-loss is 0.5090037922697467
Valid AUC is 0.812786647498434
Learning rate set to 0.012572
0:	learn: 0.6881789	total: 12ms	remaining: 23.9s
1:	learn: 0.6833460	total: 23.6ms	remaining: 23.6s
2:	learn: 0.6786440	total: 35.5ms	remaining: 23.6s
3:	learn: 0.6740763	total: 48.2ms	remaining: 24.1s
4:	learn: 0.6696408	total: 59.1ms	remaining: 23.6s
5:	learn: 0.6653244	total: 70.5ms	remaining: 23.4s
6:	learn: 0.6611224	total: 82.6ms	remaining: 23.5s
7:	learn: 0.6570436	total: 93.9ms	remaining: 23.4s
8:	learn: 0.6530729	total: 106ms	remaining: 23.6s
9:	learn: 0.6492123	total: 118ms	remaining: 23.5s
10:	learn: 0.6454648	total: 129ms	remaining: 23.4s
11:	learn: 0.6418183	total: 141ms	remaining: 23.3s
12:	learn: 0.6382725	total: 153ms	remaining: 23.4s
13:	learn: 0.6348293	total: 164ms	remaining: 23.3s
14:	learn: 0.6314833	total: 175ms	remaining: 23.2s
15:	learn: 0.6282277	total: 188ms	remaining: 23.3s
16:	learn: 0.6250655	t



Valid log-loss is 0.5078269726162877
Valid AUC is 0.8138386718432461
Valid log-loss is 0.5082532843485519
Valid AUC is 0.8131253951692738
Learning rate set to 0.012572
0:	learn: 0.6881873	total: 11.7ms	remaining: 23.4s
1:	learn: 0.6833640	total: 23.3ms	remaining: 23.2s
2:	learn: 0.6786687	total: 35.1ms	remaining: 23.3s
3:	learn: 0.6741090	total: 46.4ms	remaining: 23.1s
4:	learn: 0.6696810	total: 57ms	remaining: 22.8s
5:	learn: 0.6653696	total: 68.6ms	remaining: 22.8s
6:	learn: 0.6611757	total: 80.2ms	remaining: 22.8s
7:	learn: 0.6571039	total: 91.5ms	remaining: 22.8s
8:	learn: 0.6531399	total: 103ms	remaining: 22.9s
9:	learn: 0.6492858	total: 115ms	remaining: 22.8s
10:	learn: 0.6455439	total: 127ms	remaining: 22.9s
11:	learn: 0.6419007	total: 138ms	remaining: 22.9s
12:	learn: 0.6383616	total: 150ms	remaining: 23s
13:	learn: 0.6349245	total: 162ms	remaining: 22.9s
14:	learn: 0.6315831	total: 173ms	remaining: 22.9s
15:	learn: 0.6283333	total: 185ms	remaining: 22.9s
16:	learn: 0.6251756	t

ValueError: all the input array dimensions for the concatenation axis must match exactly, but along dimension 1, the array at index 0 has size 574752 and the array at index 4 has size 574749

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
meta_model = LogisticRegression(n_jobs=-1)
meta_model.fit(X_augment, y_augment)
dump(meta_model, Path(datapath/f"models/{config_run['name']}_{exmodel_config['kfolds']}folds/meta-model_LogisticRegression.joblib"))

In [None]:
# def super_fit_fullset(X,y,model_list):
#     for model in model_list:
#         model[1].fit(X,y)

In [None]:
# def super_learner_predictor(X, model_list, meta_model):
#     X_meta = []
#     for model in model_list:
#         y_pred = model[1].predict_proba(X)
#         X_meta.append(y_pred)
#     X_meta = np.hstack(X_meta)
#     return meta_model.predict(X_meta)

In [31]:
# # might encapsulate this in a new version of the above train function later
# exmodel_config['ensemble'] = 'mlens.ensemble.SuperLearner'

# wandb.init(
#         project="202109_Kaggle_tabular_playground",
#         save_code=True,
#         tags=config_run['tags'],
#         name=config_run['name'],
#         notes=config_run['notes'],
#         config=exmodel_config)   

# random_state = exmodel_config['random_state'] # 42

# model_config = model_configurator('xgboost')
# xgboost_model = XGBClassifier(
#             tree_method=model_config['tree_method'],
#             random_state=random_state,
# #             n_jobs=model_config['n_jobs'], 
#             verbosity=model_config['verbosity'], 
#             objective=model_config['objective'],
#             # #             eval_metric=model_config['eval_metric'],

#             # comment out the below for a fairly default model
# #             booster=model_config['booster'],
#             max_depth=model_config['max_depth'],
#             learning_rate=model_config['learning_rate'], 
#             subsample=model_config['subsample'],
#             reg_alpha=model_config['reg_alpha'],
#             reg_lambda=model_config['reg_lambda'],
#             n_estimators=model_config['n_estimators'],
#         )

# model_config = model_configurator('lightgbm')
# lightgbm_model = LGBMClassifier(
#             random_state=random_state,
# #             n_jobs=model_config['n_jobs'],
#             objective=model_config['objective'],
#             boosting_type=model_config['boosting_type'],
#             device_type=model_config['device_type'],
            
#             # comment out the below for a basically default model
#             n_estimators=model_config['n_estimators'],
#             learning_rate=model_config['learning_rate'],
#             max_depth=model_config['max_depth'],
#             reg_alpha=model_config['reg_alpha'],
#             reg_lambda=model_config['reg_lambda'],
#             subsample=model_config['subsample'],
#         )

# model_config = model_configurator('catboost', gpu_available=False) # set GPU false to avoid parallel threads blocking GPU
# catboost_model = CatBoostClassifier(
#             task_type=model_config['task_type'],
#             n_estimators=model_config['n_estimators'],
#             random_state=random_state,
#         ) 

# blender = SuperLearner(folds=5, 
#                              shuffle=False, 
#                              random_state=random_state,
#                              n_jobs=3,
#                              verbose=1,
# #                              scorer=roc_auc_score,
#                             )

# estimators_list = [
#     ('xgboost', xgboost_model),
#     ('lightgbm', lightgbm_model),
#     ('catboost', catboost_model)
# ]

# blender.add(estimators_list, 
#                   proba=True, # ensures use of predict_proba
#                   propagate_features=False # seemed to work better in stacking
#                  )

# blender.add_meta(LogisticRegression()) # a simple linear model should work best

# # wandb.log({'estimators': estimators_list})


# # blender = StackingClassifier(estimators=estimators_list,
# # #                              final_estimator=XGBRegressor(),
# #                              cv=5,
# #                              stack_method='predict_proba',
# #                              n_jobs=3,
# #                              passthrough=True,
# #                              verbose=1
# #                             )
# exmodel_config['blender-passthrough'] = False
# # exmodel_config['blender_final_estimator'] = 

           
    

[34m[1mwandb[0m: Currently logged in as: [33mhushifang[0m (use `wandb login --relogin` to force relogin)


In [32]:
# wandb.log({'blender-final_estimator': LogisticRegression,
# #            'blender-final_estimator_params': str(blender.final_estimator.get_params()),
#            'blender-stack_mdethod': 'predict_proba',
#            'blender-cv': 5
#           })

In [33]:
# print(f"Starting fitting at {datetime.now().strftime('%Y%m%d_%H%M%S')}")
# blender.fit(X,y) # unsure of this -- given kwarg cv=5, is it producing the splits? Or do I have to somehow?
# print(f"Fitting complete at {datetime.now().strftime('%Y%m%d_%H%M%S')}")

Starting fitting at 20210911_204425

Fitting 2 layers
Learning rate set to 0.10231


Custom logger is already specified. Specify more than one logger at same time is not thread safe.Custom logger is already specified. Specify more than one logger at same time is not thread safe.

0:	learn: 0.6549103	total: 770ms	remaining: 25m 39s
Learning rate set to 0.093011
Learning rate set to 0.093011
1:	learn: 0.6248772	total: 1.29s	remaining: 21m 33s
0:	learn: 0.6582614	total: 248ms	remaining: 8m 16s
2:	learn: 0.6012832	total: 1.64s	remaining: 18m 12s
1:	learn: 0.6302052	total: 509ms	remaining: 8m 28s
0:	learn: 0.6581684	total: 339ms	remaining: 11m 16s
2:	learn: 0.6076621	total: 790ms	remaining: 8m 46s
3:	learn: 0.5827574	total: 1.96s	remaining: 16m 19s
1:	learn: 0.6300503	total: 745ms	remaining: 12m 23s
3:	learn: 0.5895252	total: 1.08s	remaining: 9m 1s
4:	learn: 0.5681846	total: 2.34s	remaining: 15m 32s
2:	learn: 0.6074422	total: 1.09s	remaining: 12m 5s
4:	learn: 0.5749554	total: 1.37s	remaining: 9m 5s
5:	learn: 0.5567044	total: 2.67s	remaining: 14m 46s
3:	learn: 0.5892795	total: 1.35s	remaining: 11m 16s
5:	learn: 0.5632307	total: 1.62s	remaining: 8m 58s
6:	learn: 0.5476754	total: 2.98s	remaining: 14m 9s
4:	learn: 0.5746635	total: 1.68s	remaining: 11m 10s
6:	learn: 0.55



Fit complete                        | 01:36:38
Fitting complete at 20210911_222104


In [34]:
# wandb.log({'xgboost_params':str(xgboost_model.get_params()),
#            'lightgbm_params':str(lightgbm_model.get_params()),
# #            'catboost_params':str(blender.estimators[2][1].get_all_params()),
#           })

In [35]:
# model_path = Path(datapath/f"models/{config_run['name']}/")
# (model_path).mkdir(exist_ok=True)
# dump(blender, filename=model_path/f"{config_run['name']}_super-learner.joblib")
# print(f"Blender model saved at {datetime.now().strftime('%Y%m%d_%H%M%S')}")

Blender model saved at 20210911_222109


In [37]:
# train_preds = blender.predict_proba(X)#[:,1]
# train_loss = log_loss(y_pred=train_preds, y_true=y)
# train_auc = roc_auc_score(y, train_preds)
# wandb.log({'train_loss': train_loss, 'train_auc': train_auc})
# print(f"train_loss is {train_loss}, train_auc is {train_auc}")


Predicting 2 layers




Predict complete                    | 00:00:04


  loss = -(transformed_labels * np.log(y_pred)).sum(axis=1)
  loss = -(transformed_labels * np.log(y_pred)).sum(axis=1)


train_loss is nan, train_auc is 0.7818530616374615


In [None]:
# train_preds[:20]

# Test set preprocessing


(Here's where encapsulating the transformations in a pipeline would come in handy. But I'll do it manually for now.)

In [None]:
# features = [x for x in test_df.columns if x != 'claim']
# X_test = test_df[features] # this is just for naming consistency

Now, let's get the features the model was trained on and subset the test set's features accordingly

In [None]:
# generation polynomial features
# poly = PolynomialFeatures(degree=2, include_bias=True, interaction_only=False)
# X_test_poly = poly.fit_transform(X_test)

In [None]:
# X_test_poly_names = poly.get_feature_names(X_test.columns)
# X_poly_names[100:150]
# features = pd.read_csv('X_candidates_20210827.csv').columns

In [None]:
# checks = [feature in X_test_poly_names for feature in features]
# checks

In [None]:
# X_test_final = pd.DataFrame(X_test_poly, columns=X_test_poly_names)

In [None]:
# X_test_final = X_test_final[features[1:]]
# X_test_final = X_test

In [None]:
# X_test['nan_count'] = X_test.isnull().sum(axis=1)

In [None]:
# imputer = SimpleImputer(strategy='median', add_indicator=True)
# X_test_imputed_np = imputer.fit_transform(X_test)

In [None]:
# X_test_imputed = pd.DataFrame(X_test_imputed, columns=[str(x) for x in range(X_test_imputed.shape[1])])
# X_test_imputed.to_feather(path=datapath/'X_test_NaNcounts_imputed-Median-wIndicators.feather')

In [None]:
# scaler = exmodel_config['scaler']()
# X_test_imputed_scaled_np = scaler.fit_transform(X_test_imputed)
# X_test_imputed_scaled = pd.DataFrame(X_test_imputed_scaled_np, columns=X_test_imputed.columns)
# X_test_imputed_scaled.to_feather(path=datapath/'X_test_NaNcounts_imputed-Median-wIndicators_StandardScaled.feather')
# X_scaled_df = pd.DataFrame(X_scaled, columns=X_poly_names)

In [39]:
X_test_imputed_scaled = pd.read_feather(path=datapath/'X_test_NaNcounts_imputed-Median-wIndicators_StandardScaled.feather')

## Prediction Generation

In [41]:
# preds_path = Path(datapath/"preds/")

# blender_preds = blender.predict_proba(X_test_imputed_scaled)#[:,1]
# dump(blender_preds, preds_path/f"{config_run['name']}_stack.joblib")


Predicting 2 layers




Predict complete                    | 00:00:02


['/media/sf/easystore/kaggle_data/tabular_playgrounds/sep2021/preds/super-learner_off-shelf_20210911_204351_stack.joblib']

# Submission

In [42]:
sample_df = pd.read_csv(datapath/'sample_solution.csv')

In [43]:
sample_df.loc[:, 'claim'] = blender_preds

In [44]:
sample_df.head()

Unnamed: 0,id,claim
0,957919,1.0
1,957920,0.0
2,957921,1.0
3,957922,0.0
4,957923,0.0


In [45]:
submission_path = datapath/'submissions'
submission_path.mkdir(exist_ok=True)

In [46]:
sample_df.to_csv(submission_path/f"{config_run['name']}_blended.csv", index=False)

In [56]:
wandb.log({'leaderboard_auc':  })

In [57]:
wandb.finish()

VBox(children=(Label(value=' 0.27MB of 0.27MB uploaded (0.08MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
blender-final_estimator,
blender-stack_mdethod,predict_proba
blender-cv,5
_runtime,7379
_timestamp,1631403974
_step,3
xgboost_params,{'objective': 'binar...
lightgbm_params,{'boosting_type': 'g...
train_loss,0.49101
train_auc,0.84658


0,1
blender-cv,▁
_runtime,▁▇▇█
_timestamp,▁▇▇█
_step,▁▃▆█
train_loss,▁
train_auc,▁
leaderboard_auc,▁


## Manual Stacking

In [73]:
X.head()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,227,228,229,230,231,232,233,234,235,236
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.425545,-2.357891,-0.637206,-0.866657,-0.111568,-4.829243,-1.171229,-0.603397,-0.596871,-0.516828,...,-0.128368,-0.127677,-0.128242,-0.127867,-0.127119,-0.127985,-0.128494,-0.12862,-0.127854,-0.12703
1,0.2476,-0.323982,1.223569,0.361863,1.071182,-0.36114,0.082051,-0.74659,0.899454,0.469668,...,-0.128368,-0.127677,-0.128242,-0.127867,-0.127119,-0.127985,-0.128494,-0.12862,-0.127854,-0.12703
2,2.032371,-2.43568,-0.48896,0.341193,1.069656,0.118532,0.537069,-0.044075,-0.763516,1.056879,...,-0.128368,-0.127677,-0.128242,-0.127867,-0.127119,-0.127985,-0.128494,-0.12862,-0.127854,-0.12703
3,1.438373,-2.337605,-0.508914,-0.829607,1.485682,3.592008,-1.189087,-0.339152,-0.735281,-0.529158,...,-0.128368,-0.127677,-0.128242,-0.127867,-0.127119,-0.127985,-0.128494,-0.12862,-0.127854,-0.12703
4,0.602333,1.076218,-0.648438,0.463365,0.275053,-0.157989,0.727338,-0.905498,0.052478,-0.511066,...,-0.128368,-0.127677,-0.128242,-0.127867,-0.127119,-0.127985,-0.128494,-0.12862,7.821398,-0.12703


In [119]:
X1 = X.copy()

In [120]:
X1.shape

(957919, 237)

In [121]:
# generate probability predictions for the XGBoost model's folds
for fold in xgboost_models.keys():
#     X1[f"xgboost_fold{fold}_pred"] = xgboost_models[fold].predict(X)
    X1[f"xgboost_fold{fold}_pred"] = xgboost_models[fold].predict_proba(X)[:,1]
#     xgboost_preds[fold] = xgboost_models[fold].predict(X_test_imputed_scaled)



In [122]:
X1.head()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,232,233,234,235,236,xgboost_fold0_pred,xgboost_fold1_pred,xgboost_fold2_pred,xgboost_fold3_pred,xgboost_fold4_pred
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.425545,-2.357891,-0.637206,-0.866657,-0.111568,-4.829243,-1.171229,-0.603397,-0.596871,-0.516828,...,-0.127985,-0.128494,-0.12862,-0.127854,-0.12703,0.582566,0.58095,0.576743,0.569523,0.595877
1,0.2476,-0.323982,1.223569,0.361863,1.071182,-0.36114,0.082051,-0.74659,0.899454,0.469668,...,-0.127985,-0.128494,-0.12862,-0.127854,-0.12703,0.152252,0.150803,0.148316,0.155218,0.147297
2,2.032371,-2.43568,-0.48896,0.341193,1.069656,0.118532,0.537069,-0.044075,-0.763516,1.056879,...,-0.127985,-0.128494,-0.12862,-0.127854,-0.12703,0.794083,0.789945,0.788326,0.787177,0.797979
3,1.438373,-2.337605,-0.508914,-0.829607,1.485682,3.592008,-1.189087,-0.339152,-0.735281,-0.529158,...,-0.127985,-0.128494,-0.12862,-0.127854,-0.12703,0.774001,0.76851,0.774555,0.782187,0.773245
4,0.602333,1.076218,-0.648438,0.463365,0.275053,-0.157989,0.727338,-0.905498,0.052478,-0.511066,...,-0.127985,-0.128494,-0.12862,7.821398,-0.12703,0.759366,0.755764,0.763769,0.758034,0.758038
