# Super-Learner Manual Implementation
Since `mlens.ensemble.SuperLearner` does not appear to be capable of correctly outputting probabilities in binary classification tasks, I'm going to roll my own implementation of the underlying algorithm.

# Setup

In [1]:
# two manual flags (ex-config)
# Both are set by hand before running the notebook; later cells read them to decide
# what to install, where the data lives, and whether GPU training is requested.
colab = False
gpu_available = False
# boosting libraries used in this notebook; consulted by the Colab install cell and
# stored under exmodel_config['library']
libraries = ['xgboost', 'lightgbm', 'catboost']

In [2]:
# basic imports
from pathlib import Path
import os
import math
from datetime import datetime

In [3]:
# render matplotlib figures inline in the notebook
%matplotlib inline
# turn off jedi-based tab completion in IPython
%config Completer.use_jedi = False
# notebook name for Weights & Biases, stamped with today's date (YYYYMMDD);
# config_run['name'] is later derived from this value by stripping the ".ipynb" suffix
os.environ['WANDB_NOTEBOOK_NAME'] = f"super-learner_manual_{datetime.now().strftime('%Y%m%d')}.ipynb"

In [4]:
# handle Google Colab-specific library installation/updating
# Nothing in this cell runs unless the manual `colab` flag is True. Which boosting
# libraries get installed is driven by `libraries`; `gpu_available` switches between
# building from source with GPU support and a plain pip upgrade.
if colab:
    # much of the below inspired by or cribbed from the May 2021 Kaggle Tabular Playground winner, at 
    # https://colab.research.google.com/gist/academicsuspect/0aac7bd6e506f5f70295bfc9a3dc2250/tabular-may-baseline.ipynb?authuser=1#scrollTo=LJoVKJb5wN0L
    
    # Kaggle API for downloading the datasets
    !pip install --upgrade -q kaggle

    # weights and biases
    !pip install -qqqU wandb
    
    # Optuna for parameter search
    !pip install -q optuna

    # upgrade sklearn
    !pip install --upgrade scikit-learn

    # category_encoders package
    !pip install category_encoders
    
    if 'catboost' in libraries:
        !pip install catboost
    
    if 'xgboost' in libraries:
        if gpu_available: 
            # build XGBoost from source with CUDA enabled
            # this part is from https://github.com/rapidsai/gputreeshap/issues/24
            !pip install cmake --upgrade
            # !pip install sklearn --upgrade
            !git clone --recursive https://github.com/dmlc/xgboost
            # NOTE: the %cd magics change the notebook's working directory for all later cells
            %cd /content/xgboost
            !mkdir build
            %cd build
            !cmake .. -DUSE_CUDA=ON
            !make -j4
            %cd /content/xgboost/python-package
            !python setup.py install --use-cuda --use-nccl
            !/opt/bin/nvidia-smi
            !pip install shap
        else:
            !pip install --upgrade xgboost
    if 'lightgbm' in libraries:
        if gpu_available:
            # LightGBM, GPU-compatible build from source
            !git clone --recursive https://github.com/Microsoft/LightGBM
            ! cd LightGBM && rm -rf build && mkdir build && cd build && cmake -DUSE_GPU=1 ../../LightGBM && make -j4 && cd ../python-package && python3 setup.py install --precompile --gpu;
        else:
            !pip install --upgrade lightgbm
        

        

Now, non-stdlib imports

In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm

# general ML tooling
from sklearn.model_selection import train_test_split, StratifiedKFold, KFold
from sklearn.metrics import log_loss, roc_auc_score
import wandb
from wandb.xgboost import wandb_callback
from wandb.lightgbm import wandb_callback
from sklearn.impute import SimpleImputer #, KNNImputer
# import timm
import seaborn as sns

from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import StackingClassifier, RandomForestClassifier
# from mlens.ensemble import SuperLearner
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler #, MinMaxScaler, MaxAbsScaler, RobustScaler, PolynomialFeatures
# from sklearn.feature_selection import SelectKBest, f_regression
from joblib import dump, load
# feature engineering tools
# from sklearn.feature_selection import mutual_info_regression
# import featuretools as ft


Now, datapath setup

In [6]:
# # This is the code for reading the train.csv and converting it to a .feather file
# df = pd.read_csv(datapath/'train.csv', index_col='id', low_memory=False)
# df.index.name = None
# df.to_feather(path='./dataset_df.feather')

In [7]:
# Resolve the directory holding the competition data for the current environment.
if not colab:
    # running on the local machine
    datapath = Path('/media/sf/easystore/kaggle_data/tabular_playgrounds/sep2021/')
else:
    # running on Google Colab: mount Google Drive, then point at the data inside it
    from google.colab import drive
    drive.mount('/content/drive')
    datapath = Path('/content/drive/MyDrive/kaggle/tabular_playgrounds/sep2021/')
    


## Ex-Model Config

In [8]:
# Meta-config for preprocessing and cross-validation, but NOT for model parameters.
# The training functions pass this dict to wandb.init(config=...).
#
# Keys that were sketched out but are currently disabled:
#   model / n_estimators / max_depth / learning_rate / reg_lambda / subsample,
#   test_size, knn_imputer_n_neighbors (None if a different imputer is used),
#   feature_selector / k_best / feature_selection_scoring,
#   cross_val_strategy (None for holdout, or the relevant sklearn class),
#   kfolds (1 means holdout), features_created / feature_creator.
exmodel_config = dict(
    library=libraries,
    scaler=StandardScaler,  # TODO: experiment with others (but imputation may be slow)
    scale_b4_impute=False,
    imputer=SimpleImputer(strategy='median', add_indicator=True),
    random_state=42,
)

## Data Setup

**TODO** Write some conditional logic here to automate the preprocessing steps (scaling and imputation) -- possibly as part of a `sklearn.pipeline.Pipeline`.

In [9]:
# if exmodel_config['scaler']:
#     scaler = exmodel_config['scaler']()
#     scaler.fit_transform()

In [10]:
# # For reference: loading the original, unaltered dataset and separating features from targets
# df = pd.read_feather(path=datapath/'dataset_df.feather') # this is the unaltered original dataset
# features = [x for x in df.columns if x != 'claim']
# X = df[features]
# y = df.claim

# Load the already-preprocessed dataset. Features and targets were stored separately:
# the features as a feather file, the targets as a joblib dump.
X = pd.read_feather(datapath/'X_NaNcounts_imputed-Median-wIndicators-StandardScaled.feather')
y = load(datapath/'y.joblib')

# give both indexes the same name
for _frame in (X, y):
    _frame.index.name = 'id'

# record dataset facts alongside the rest of the ex-model configuration
exmodel_config.update(feature_count=X.shape[1], feature_generator=None)

In [11]:
# scaler = exmodel_config['scaler']()
# X_scaled = scaler.fit_transform(X)
# X = pd.DataFrame(X_scaled, columns=X.columns)

In [12]:
# peek at the first rows of the preprocessed feature matrix
X.head()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,227,228,229,230,231,232,233,234,235,236
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.425545,-2.357891,-0.637206,-0.866657,-0.111568,-4.829243,-1.171229,-0.603397,-0.596871,-0.516828,...,-0.128368,-0.127677,-0.128242,-0.127867,-0.127119,-0.127985,-0.128494,-0.12862,-0.127854,-0.12703
1,0.2476,-0.323982,1.223569,0.361863,1.071182,-0.36114,0.082051,-0.74659,0.899454,0.469668,...,-0.128368,-0.127677,-0.128242,-0.127867,-0.127119,-0.127985,-0.128494,-0.12862,-0.127854,-0.12703
2,2.032371,-2.43568,-0.48896,0.341193,1.069656,0.118532,0.537069,-0.044075,-0.763516,1.056879,...,-0.128368,-0.127677,-0.128242,-0.127867,-0.127119,-0.127985,-0.128494,-0.12862,-0.127854,-0.12703
3,1.438373,-2.337605,-0.508914,-0.829607,1.485682,3.592008,-1.189087,-0.339152,-0.735281,-0.529158,...,-0.128368,-0.127677,-0.128242,-0.127867,-0.127119,-0.127985,-0.128494,-0.12862,-0.127854,-0.12703
4,0.602333,1.076218,-0.648438,0.463365,0.275053,-0.157989,0.727338,-0.905498,0.052478,-0.511066,...,-0.128368,-0.127677,-0.128242,-0.127867,-0.127119,-0.127985,-0.128494,-0.12862,7.821398,-0.12703


In [13]:
# peek at the first target values (the `claim` column)
y.head()

id
0    1
1    0
2    1
3    1
4    1
Name: claim, dtype: int64

### Model Config

In [14]:
def model_configurator(library, gpu_available=True):
    """
    Build the keyword settings used to construct one of the boosting models.

    The values are a mix of task-appropriate settings (objective, device) and
    hyperparameters found in earlier sweeps. Keeping them in one helper makes it
    easy to experiment and to cycle through several models in a composite run.

    :param library: name of the library, one of 'xgboost', 'lightgbm' or 'catboost'.
                    Any other value produces an empty dict.
    :param gpu_available: whether a GPU may be used. For xgboost and lightgbm the GPU
                          is only selected when the notebook-level `colab` flag is also
                          truthy; for catboost this flag alone decides.
    :return config: a new dict of settings for the requested library.
    """
    if library == 'xgboost':
        # best params per sweep `icac24c5`, generated from notebook `sweep_20210905.ipynb`
        # runtime per fold should be around 12m 38s
        # should get auc of 0.7434 on the random_state=42 holdout
        # haven't yet tried dart
        on_gpu = gpu_available and colab
        return {
            'n_jobs': -1,
            'verbosity': 1,
            'objective': 'binary:logistic',
            'tree_method': 'gpu_hist' if on_gpu else 'auto',
            # drop the entries below to fall back on library defaults
            'n_estimators': 902,
            'learning_rate': 0.0304,
            'max_depth': 3,
            'reg_alpha': 0.863,
            'reg_lambda': 2.442,
            'subsample': 0.8627,
        }

    if library == 'lightgbm':
        # best params per sweep `sjghewf0`, generated from notebook `sweep_lightgbm_20210907`
        # run name `sweep_lightgbm_20210907_195641`
        # runtime per fold should be around 39s
        # should get an auc of 0.7435 on random_state=42 holdout
        on_gpu = gpu_available and colab
        return {
            'n_jobs': -1,
            'objective': 'binary',
            'eval_metric': ['auc', 'logloss'],
            'boosting_type': 'gbdt',  # or 'dart'
            'device_type': 'cuda' if on_gpu else 'cpu',  # 'gpu' also possible, 'cpu' is default
            # drop the entries below to fall back on library defaults
            'n_estimators': 1286,
            'learning_rate': 0.03221,
            'max_depth': 2,
            'reg_alpha': 0.4687,
            'reg_lambda': 0.1763,
            'subsample': 0.6621,
        }

    if library == 'catboost':
        return {
            'task_type': 'GPU' if gpu_available else 'CPU',
            # objective (loss fn) must be singular, defaults to Logloss
            'custom_metrics': ['Logloss', 'AUC'],
            'n_estimators': 2000,  # logged as "iterations" otherwise
        }

    # unrecognised library: nothing to configure
    return {}

## Weights and Biases Run Config

Below is the configuration for a Weights and Biases (`wandb`) run.

In [15]:
# wandb config: the name, tags and notes handed to wandb.init by the training functions
_notebook_stem = os.environ['WANDB_NOTEBOOK_NAME'][:-6]  # drop the 6-character ".ipynb" suffix
_launch_time = datetime.now().strftime('%H%M%S')
config_run = dict(
    name=f"{_notebook_stem}_{_launch_time}",
    tags=['super-learner', 'experiment', 'attempt'],
    notes="Since `mlens.ensemble.SuperLearner` does not appear to be capable of correctly outputting probabilities in binary classification tasks, I'm going to roll my own implementation of the underlying algorithm.",
)

# Preprocessing
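Preprocessing has already been applied to the data loaded above. **Note:** the loaded file name (`X_NaNcounts_imputed-Median-wIndicators-StandardScaled.feather`) and `exmodel_config` both point to median imputation with `SimpleImputer(strategy='median', add_indicator=True)` plus `StandardScaler`, whereas this note describes using `StandardScaler` as a precursor to `KNNImputer(n_neighbors=5)`, on the premise that imputation would proceed more quickly if things were already scaled -- the two should be reconciled. I may try different permutations of this later: using `IterativeImputer` instead, before or after scaling, potentially with different scalers.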

# Feature Creation and Selection

In [16]:
# load all the polynomialfeatures generated with `PolynomialFeatures(degree=2, include_bias=True, interaction_only=False)`
# X_np = np.load(datapath/'X_poly_unscaled.npy')
# X = pd.DataFrame(X_np)

In [17]:
# X.columns

In [18]:
# generation polynomial features
# poly = PolynomialFeatures(degree=2, include_bias=True, interaction_only=False)
# X_poly = poly.fit_transform(X)

In [19]:
# X_poly_names = poly.get_feature_names(X.columns)
# # X_poly_names[100:150]
# features = pd.read_csv('X_candidates_20210827.csv').columns

In [20]:
# checks = [feature in X_poly_names for feature in features]
# checks

In [21]:
# X = pd.DataFrame(X_poly, columns=X_poly_names)

In [22]:
# X = X[features[1:]]

# Training

In [23]:
def super_train(X_train, X_valid, y_train, y_valid, model_config, 
                                              random_state=42,
                                              exmodel_config=exmodel_config, 
                                              config_run=config_run):
    """
    Train one model on a single train/validation split and log the run to Weights & Biases.

    Some options are hard-coded rather than taken from the arguments (e.g. the wandb
    project name), to avoid inconveniences.

    NOTE: a later cell in this notebook defines another function that is also called
    `super_train`; once that cell has run, this definition is shadowed.

    :param X_train: the training set features
    :param X_valid: the validation set features
    :param y_train: the training set targets
    :param y_valid: the validation set targets
    :param model_config: dict containing hyperparameter specifications for the model
                         (see `model_configurator`)
    :param random_state: for reproducibility
    :param exmodel_config: dict containing configuration details including the library
                            (thus model) used, preprocessing, and cross-validation.
                            `exmodel_config['library']` must be a single library name.
    :param config_run: dict containing wandb run configuration (name, etc)
    :return: tuple `(model, y_pred)` -- the fitted model and its predicted positive-class
             probabilities for `X_valid`
    :raises ValueError: if `exmodel_config['library']` is not one of
                        'xgboost', 'lightgbm', 'catboost'
    """
    supported_libraries = ('xgboost', 'lightgbm', 'catboost')
    library = exmodel_config['library']

    # Fail fast, before a wandb run is opened. Without this check an unsupported value
    # (for instance a list of library names) matches no branch below, `model` is never
    # bound, and the function dies later with an unhelpful NameError.
    if not (isinstance(library, str) and library in supported_libraries):
        raise ValueError(
            f"exmodel_config['library'] must be one of {supported_libraries}, got {library!r}"
        )

    wandb.init(
        project="202109_Kaggle_tabular_playground",
        save_code=True,
        tags=config_run['tags'],
        name=config_run['name'],
        notes=config_run['notes'],
        config=exmodel_config)   
        
    if library == 'xgboost':
        model = XGBClassifier(
            tree_method=model_config['tree_method'],
            random_state=random_state,
            n_jobs=model_config['n_jobs'], 
            verbosity=model_config['verbosity'], 
            objective=model_config['objective'],

            # comment out the below for a fairly default model
            max_depth=model_config['max_depth'],
            learning_rate=model_config['learning_rate'], 
            subsample=model_config['subsample'],
            reg_alpha=model_config['reg_alpha'],
            reg_lambda=model_config['reg_lambda'],
            n_estimators=model_config['n_estimators'],
        )
        
        model.fit(X_train, y_train, callbacks=[wandb.xgboost.wandb_callback()])

    elif library == 'lightgbm':
        model = LGBMClassifier(
            random_state=random_state,
            n_jobs=model_config['n_jobs'],
            objective=model_config['objective'],
            boosting_type=model_config['boosting_type'],
            device_type=model_config['device_type'],
            
            # comment out the below for a basically default model
            n_estimators=model_config['n_estimators'],
            learning_rate=model_config['learning_rate'],
            max_depth=model_config['max_depth'],
            reg_alpha=model_config['reg_alpha'],
            reg_lambda=model_config['reg_lambda'],
            subsample=model_config['subsample'],
        )
        
        model.fit(X_train, y_train, callbacks=[wandb.lightgbm.wandb_callback()])
        
    elif library == 'catboost':
        print("CatBoost, therefore no WandB callback.")
        model = CatBoostClassifier(
            task_type=model_config['task_type'],
            n_estimators=model_config['n_estimators'],
            random_state=random_state,
            # objective='Logloss', # default, accepts only one
        ) 
        model.fit(X_train, y_train)
        
    # training-set metrics, computed from positive-class probabilities
    y_train_pred = model.predict_proba(X_train)[:,1]

    train_loss = log_loss(y_train, y_train_pred)
    train_auc = roc_auc_score(y_train, y_train_pred)
    wandb.log({'train_loss': train_loss, 'train_auc': train_auc})

    # log the fitted model's parameters
    if library == 'catboost':
        print(model.get_all_params())
        wandb.log(model.get_all_params())
    else:
        wandb.log(model.get_params())
    
    # validation-set metrics, again from positive-class probabilities
    y_pred = model.predict_proba(X_valid)[:,1]

    valid_loss = log_loss(y_valid, y_pred)
    valid_auc = roc_auc_score(y_valid, y_pred)
    wandb.log({'valid_loss':valid_loss, 'valid_auc':valid_auc})
    print(f"Valid log-loss is {valid_loss}\nValid AUC is {valid_auc}")   
    # the wandb run is intentionally left open here (no wandb.finish())
    return model, y_pred
    

In [24]:
# Base learners for the super-learner: one estimator per boosting library, each built
# from the settings returned by `model_configurator`.
random_state = 42

# --- XGBoost ---
model_config = model_configurator('xgboost')
xgb_estimator = XGBClassifier(
    random_state=random_state,
    tree_method=model_config['tree_method'],
    n_jobs=model_config['n_jobs'],
    verbosity=model_config['verbosity'],
    objective=model_config['objective'],
    # remove the arguments below for a fairly default model
    max_depth=model_config['max_depth'],
    learning_rate=model_config['learning_rate'],
    subsample=model_config['subsample'],
    reg_alpha=model_config['reg_alpha'],
    reg_lambda=model_config['reg_lambda'],
    n_estimators=model_config['n_estimators'],
)

# --- LightGBM ---
model_config = model_configurator('lightgbm')
lgbm_estimator = LGBMClassifier(
    random_state=random_state,
    n_jobs=model_config['n_jobs'],
    objective=model_config['objective'],
    boosting_type=model_config['boosting_type'],
    device_type=model_config['device_type'],
    # remove the arguments below for a basically default model
    n_estimators=model_config['n_estimators'],
    learning_rate=model_config['learning_rate'],
    max_depth=model_config['max_depth'],
    reg_alpha=model_config['reg_alpha'],
    reg_lambda=model_config['reg_lambda'],
    subsample=model_config['subsample'],
)

# --- CatBoost (explicitly configured for CPU) ---
model_config = model_configurator('catboost', gpu_available=False)
catboost_estimator = CatBoostClassifier(
    task_type=model_config['task_type'],
    n_estimators=model_config['n_estimators'],
    random_state=random_state,
    # objective='Logloss' is the default and only one may be given
)

# (name, estimator) pairs, in the order their predictions become meta-feature columns
model_list = [
    ('xgboost', xgb_estimator),
    ('lightgbm', lgbm_estimator),
    ('catboost', catboost_estimator),
]

In [25]:
# every base learner except the first (xgboost) entry
# NOTE(review): `test_list` is not passed to `super_train` in the cells visible below,
# which use the full `model_list` -- confirm whether it is still needed
test_list = model_list[1:]

In [26]:
# display the reduced list
test_list

[('lightgbm',
  LGBMClassifier(device_type='cpu', learning_rate=0.03221, max_depth=2,
                 n_estimators=1286, objective='binary', random_state=42,
                 reg_alpha=0.4687, reg_lambda=0.1763, subsample=0.6621)),
 ('catboost', <catboost.core.CatBoostClassifier at 0x7f336c156640>)]

In [27]:
def super_train(model_list=model_list, 
                           X=X, 
                           y=y, 
                           start_fold=0, 
                           exmodel_config=exmodel_config, 
                           random_state=42, 
                           wandb_run=True, 
                           passthrough=False):
    """
    Build the meta-learner's training data from out-of-fold predictions of the base models.

    The data is split with a shuffled 5-fold `KFold`. In every fold each estimator in
    `model_list` is fit on the training part and predicts positive-class probabilities
    for the validation part. Approach follows
    https://machinelearningmastery.com/super-learner-ensemble-in-python/ (Super Learner
    for Classification).

    Side effects:
      * sets `exmodel_config['kfolds']` (5) and `exmodel_config['cross_val_strategy']` ('KFold');
      * optionally opens a wandb run and logs per-model metrics and parameters
        (the run is not finished here);
      * writes per-model and per-fold predictions with joblib under
        `datapath/preds/<run name>_<kfolds>folds/`, using the notebook-level
        `datapath` and `config_run`.

    :param model_list: list of `(name, estimator)` tuples; estimators must provide
                       `fit` and `predict_proba`
    :param X: pandas DataFrame of features
    :param y: pandas Series of binary targets
    :param start_fold: folds with an index below this are skipped entirely, so their
                       rows are absent from the returned arrays
    :param exmodel_config: dict of general configuration; mutated as described above
    :param random_state: seed for the fold shuffling
    :param wandb_run: whether to log to Weights & Biases
    :param passthrough: currently unused
    :return: tuple `(X_meta, y_meta)`. `X_meta` has one column per estimator in
             `model_list` and one row per validation sample, stacked in fold order;
             `y_meta` is the 1-D array of the matching labels.
    """
    # Set the cross-validation details first, so they are already in the dict at the
    # moment it is handed to wandb.init as the run config.
    exmodel_config['kfolds'] = 5
    exmodel_config['cross_val_strategy'] = 'KFold'

    if wandb_run:
        wandb.init(
            project="202109_Kaggle_tabular_playground",
            save_code=True,
            tags=config_run['tags'],
            name=config_run['name'],
            notes=config_run['notes'],
            config=exmodel_config)  
    
    # convert once, outside the fold loop; folds index into these arrays positionally
    X_np, y_np = X.to_numpy(), y.to_numpy()
    X_meta, y_meta = [], []
    kfold = KFold(n_splits=exmodel_config['kfolds'], shuffle=True, random_state=random_state)

    preds_path = Path(datapath/f"preds/{config_run['name']}_{exmodel_config['kfolds']}folds/")
    # parents=True: do not fail when the `preds/` directory itself is missing
    preds_path.mkdir(parents=True, exist_ok=True)

    for fold, (train_ids, valid_ids) in enumerate(kfold.split(X,y)):
        if fold < start_fold:
            continue

        fold_preds = [] # one (n_valid, 1) column of predictions per estimator
        print(f"FOLD {fold}")
        print("---------------------------------------------------")

        X_train, X_valid = X_np[train_ids], X_np[valid_ids]
        y_train, y_valid = y_np[train_ids], y_np[valid_ids]
        print(f"type(X_train) = {type(X_train)}")
        print(f"type(y_trian) = {type(y_train)}")

        # labels are collected fold by fold, in the same order as the prediction rows
        y_meta.append(y_valid)

        for name, estimator in model_list: # presumes tuples of the form (str, model)
            estimator.fit(X_train, y_train)
            y_train_pred = estimator.predict_proba(X_train)[:,1]

            train_loss = log_loss(y_train, y_train_pred)
            train_auc = roc_auc_score(y_train, y_train_pred)
            if wandb_run:
                wandb.log({'train_loss': train_loss, 'train_auc': train_auc})
                # parameters are logged as strings
                if name == 'catboost':
                    wandb.log({'catboost_params': str(estimator.get_all_params())})                        
                else:
                    wandb.log({f'{name}_params': str(estimator.get_params())}) 

            y_pred = estimator.predict_proba(X_valid)[:,1]
            fold_preds.append(y_pred.reshape(len(y_pred), 1))
            
            valid_loss = log_loss(y_valid, y_pred)
            valid_auc = roc_auc_score(y_valid, y_pred)
            
            print(f"Valid log-loss is {valid_loss}\nValid AUC is {valid_auc}")   
            
            if wandb_run:
                wandb.log({'valid_loss':valid_loss, 'valid_auc':valid_auc})
                wandb.log({'fold': fold})
            dump(y_pred, Path(preds_path/f"{name}_fold{fold}_oof-preds.joblib"))

        # side-by-side columns: one per estimator, rows = this fold's validation samples
        X_meta.append(np.hstack(fold_preds))
        dump(fold_preds, Path(preds_path/f"all_fold{fold}_oof-preds.joblib"))

    # X_meta is a list with one 2-D array per processed fold; y_meta is a list of 1-D
    # label arrays. Folds may differ in length, so the labels are concatenated directly
    # instead of going through np.asarray, which would build a ragged array.
    return np.vstack(X_meta), np.concatenate(y_meta)
        

In [28]:
# this will define the features and labels for the meta-learner:
# X_meta holds one column of out-of-fold probabilities per base model in model_list,
# y_meta the matching labels
X_meta, y_meta = super_train(model_list, wandb_run=True)

[34m[1mwandb[0m: Currently logged in as: [33mhushifang[0m (use `wandb login --relogin` to force relogin)


FOLD 0
---------------------------------------------------
type(X_train) = <class 'numpy.ndarray'>
type(y_trian) = <class 'numpy.ndarray'>




Valid log-loss is 0.5109356389636417
Valid AUC is 0.8122584973566012
Valid log-loss is 0.5113911181729013
Valid AUC is 0.8114826365905788
Learning rate set to 0.093011
0:	learn: 0.6580718	total: 138ms	remaining: 4m 35s
1:	learn: 0.6298703	total: 230ms	remaining: 3m 49s
2:	learn: 0.6072111	total: 322ms	remaining: 3m 34s
3:	learn: 0.5889982	total: 414ms	remaining: 3m 26s
4:	learn: 0.5743639	total: 513ms	remaining: 3m 24s
5:	learn: 0.5625637	total: 603ms	remaining: 3m 20s
6:	learn: 0.5530566	total: 689ms	remaining: 3m 16s
7:	learn: 0.5454104	total: 784ms	remaining: 3m 15s
8:	learn: 0.5392238	total: 882ms	remaining: 3m 15s
9:	learn: 0.5342211	total: 972ms	remaining: 3m 13s
10:	learn: 0.5301457	total: 1.06s	remaining: 3m 12s
11:	learn: 0.5268757	total: 1.15s	remaining: 3m 11s
12:	learn: 0.5242039	total: 1.24s	remaining: 3m 9s
13:	learn: 0.5220363	total: 1.33s	remaining: 3m 8s
14:	learn: 0.5202507	total: 1.42s	remaining: 3m 7s
15:	learn: 0.5188851	total: 1.5s	remaining: 3m 6s
16:	learn: 0.51



Valid log-loss is 0.5084587895255492
Valid AUC is 0.8140504514066044
Valid log-loss is 0.5089415144259191
Valid AUC is 0.8132048676403175
Learning rate set to 0.093011
0:	learn: 0.6582268	total: 91ms	remaining: 3m 1s
1:	learn: 0.6301463	total: 191ms	remaining: 3m 10s
2:	learn: 0.6075675	total: 285ms	remaining: 3m 9s
3:	learn: 0.5894241	total: 372ms	remaining: 3m 5s
4:	learn: 0.5748242	total: 471ms	remaining: 3m 7s
5:	learn: 0.5630636	total: 561ms	remaining: 3m 6s
6:	learn: 0.5536072	total: 651ms	remaining: 3m 5s
7:	learn: 0.5459646	total: 743ms	remaining: 3m 4s
8:	learn: 0.5397893	total: 834ms	remaining: 3m 4s
9:	learn: 0.5348005	total: 925ms	remaining: 3m 3s
10:	learn: 0.5307636	total: 1.02s	remaining: 3m 4s
11:	learn: 0.5274919	total: 1.12s	remaining: 3m 5s
12:	learn: 0.5248344	total: 1.21s	remaining: 3m 4s
13:	learn: 0.5226478	total: 1.29s	remaining: 3m 3s
14:	learn: 0.5208731	total: 1.39s	remaining: 3m 4s
15:	learn: 0.5194820	total: 1.49s	remaining: 3m 4s
16:	learn: 0.5182699	total



Valid log-loss is 0.5097983286020547
Valid AUC is 0.8128374867961028
Valid log-loss is 0.5102146583888731
Valid AUC is 0.812064302187439
Learning rate set to 0.093011
0:	learn: 0.6581314	total: 83.6ms	remaining: 2m 47s
1:	learn: 0.6299810	total: 166ms	remaining: 2m 45s
2:	learn: 0.6073613	total: 260ms	remaining: 2m 52s
3:	learn: 0.5891643	total: 350ms	remaining: 2m 54s
4:	learn: 0.5745494	total: 454ms	remaining: 3m 1s
5:	learn: 0.5627912	total: 550ms	remaining: 3m 2s
6:	learn: 0.5533191	total: 640ms	remaining: 3m 2s
7:	learn: 0.5456665	total: 726ms	remaining: 3m
8:	learn: 0.5394879	total: 817ms	remaining: 3m
9:	learn: 0.5344944	total: 909ms	remaining: 3m
10:	learn: 0.5304577	total: 1s	remaining: 3m 1s
11:	learn: 0.5271778	total: 1.09s	remaining: 3m
12:	learn: 0.5245267	total: 1.18s	remaining: 3m
13:	learn: 0.5223481	total: 1.28s	remaining: 3m 1s
14:	learn: 0.5205846	total: 1.36s	remaining: 2m 59s
15:	learn: 0.5191079	total: 1.46s	remaining: 3m
16:	learn: 0.5179354	total: 1.54s	remainin



Valid log-loss is 0.508592728157079
Valid AUC is 0.8134012906552728
Valid log-loss is 0.5090037922697467
Valid AUC is 0.812786647498434
Learning rate set to 0.093011
0:	learn: 0.6582014	total: 87.2ms	remaining: 2m 54s
1:	learn: 0.6301109	total: 180ms	remaining: 2m 59s
2:	learn: 0.6075284	total: 276ms	remaining: 3m 3s
3:	learn: 0.5893833	total: 360ms	remaining: 2m 59s
4:	learn: 0.5747816	total: 450ms	remaining: 2m 59s
5:	learn: 0.5630182	total: 531ms	remaining: 2m 56s
6:	learn: 0.5535613	total: 618ms	remaining: 2m 56s
7:	learn: 0.5459234	total: 706ms	remaining: 2m 55s
8:	learn: 0.5397533	total: 793ms	remaining: 2m 55s
9:	learn: 0.5347674	total: 877ms	remaining: 2m 54s
10:	learn: 0.5307368	total: 959ms	remaining: 2m 53s
11:	learn: 0.5274410	total: 1.04s	remaining: 2m 52s
12:	learn: 0.5247943	total: 1.13s	remaining: 2m 52s
13:	learn: 0.5226376	total: 1.22s	remaining: 2m 52s
14:	learn: 0.5209460	total: 1.31s	remaining: 2m 53s
15:	learn: 0.5195046	total: 1.39s	remaining: 2m 52s
16:	learn: 0



Valid log-loss is 0.5078269726162877
Valid AUC is 0.8138386718432461
Valid log-loss is 0.5082532843485519
Valid AUC is 0.8131253951692738
Learning rate set to 0.093011
0:	learn: 0.6582613	total: 88.6ms	remaining: 2m 57s
1:	learn: 0.6302014	total: 173ms	remaining: 2m 53s
2:	learn: 0.6076565	total: 270ms	remaining: 2m 59s
3:	learn: 0.5895319	total: 363ms	remaining: 3m 1s
4:	learn: 0.5749662	total: 472ms	remaining: 3m 8s
5:	learn: 0.5632232	total: 577ms	remaining: 3m 11s
6:	learn: 0.5537618	total: 689ms	remaining: 3m 16s
7:	learn: 0.5461482	total: 796ms	remaining: 3m 18s
8:	learn: 0.5399841	total: 888ms	remaining: 3m 16s
9:	learn: 0.5350007	total: 984ms	remaining: 3m 15s
10:	learn: 0.5309709	total: 1.07s	remaining: 3m 14s
11:	learn: 0.5276914	total: 1.16s	remaining: 3m 12s
12:	learn: 0.5250210	total: 1.26s	remaining: 3m 12s
13:	learn: 0.5228442	total: 1.34s	remaining: 3m 10s
14:	learn: 0.5210589	total: 1.42s	remaining: 3m 8s
15:	learn: 0.5196828	total: 1.52s	remaining: 3m 8s
16:	learn: 0.

  return array(a, dtype, copy=False, order=order)


In [29]:
# Meta-learner (blender): a logistic regression with default settings; it is fit on X_meta / y_meta in the next cell.
# NOTE(review): scikit-learn documents n_jobs as parallelising over classes in one-vs-rest problems,
# so it probably has no effect on this binary task — confirm.
meta_learner = LogisticRegression(n_jobs=-1)


In [30]:
meta_learner.fit(X_meta, y_meta)

LogisticRegression(n_jobs=-1)

In [31]:
# Blended predictions for the meta-training rows; only column 1 of predict_proba is kept.
X_preds = meta_learner.predict_proba(X_meta)[:,1]

In [32]:
# AUC of the blended predictions against y_meta. The meta-learner was fit on this same
# (X_meta, y_meta) pair, so for the blender this is an in-sample score, not a held-out estimate.
full_dataset_auc = roc_auc_score(y_meta, X_preds)

In [33]:
# Record the blender configuration and its full-dataset AUC on the active W&B run.
# The first key was previously misspelled 'blender-stack_mdethod', which logs the value
# under a different column than the intended 'blender-stack_method'.
wandb.log({'blender-stack_method': 'predict_proba',
           'blender-final_estimator': str(meta_learner),
           'blender-final_estimator_params': meta_learner.get_params(),
           'ensemble_full-dataset_auc': full_dataset_auc,
           'blender-cv': None
          })

In [36]:
# Directory holding this run's fitted models. parents=True also creates the intermediate
# `models/` directory when it does not exist yet, instead of raising FileNotFoundError.
meta_model_dir = Path(datapath/f"models/{config_run['name']}_{exmodel_config['kfolds']}folds")
meta_model_dir.mkdir(parents=True, exist_ok=True)
# Persist the fitted blender. dump() is the last expression of the cell, so its return value is displayed.
dump(meta_learner, meta_model_dir/"meta-model_LogisticRegression_fit-on-full-dataset.joblib")

['/media/sf/easystore/kaggle_data/tabular_playgrounds/sep2021/models/super-learner_manual_20210914_094720_5folds/meta-model_LogisticRegression_fit-on-full-dataset.joblib']

# Test set preprocessing


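(Here's where encapsulating the transformations in a pipeline would come in handy, but I'll do it manually for now. Note that the commented-out cells below call `fit_transform` on the test set, i.e. they re-fit the imputer and the scaler on the test data rather than applying transformers fit on the training data — a pipeline would avoid that mismatch.)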

In [None]:
# features = [x for x in test_df.columns if x != 'claim']
# X_test = test_df[features] # this is just for naming consistency

Now, let's get the features the model was trained on and subset the test set's features accordingly

In [None]:
# generate polynomial features
# poly = PolynomialFeatures(degree=2, include_bias=True, interaction_only=False)
# X_test_poly = poly.fit_transform(X_test)

In [None]:
# X_test_poly_names = poly.get_feature_names(X_test.columns)
# X_poly_names[100:150]
# features = pd.read_csv('X_candidates_20210827.csv').columns

In [None]:
# checks = [feature in X_test_poly_names for feature in features]
# checks

In [None]:
# X_test_final = pd.DataFrame(X_test_poly, columns=X_test_poly_names)

In [None]:
# X_test_final = X_test_final[features[1:]]
# X_test_final = X_test

In [None]:
# X_test['nan_count'] = X_test.isnull().sum(axis=1)

In [None]:
# imputer = SimpleImputer(strategy='median', add_indicator=True)
# X_test_imputed_np = imputer.fit_transform(X_test)

In [None]:
# X_test_imputed = pd.DataFrame(X_test_imputed, columns=[str(x) for x in range(X_test_imputed.shape[1])])
# X_test_imputed.to_feather(path=datapath/'X_test_NaNcounts_imputed-Median-wIndicators.feather')

In [None]:
# scaler = exmodel_config['scaler']()
# X_test_imputed_scaled_np = scaler.fit_transform(X_test_imputed)
# X_test_imputed_scaled = pd.DataFrame(X_test_imputed_scaled_np, columns=X_test_imputed.columns)
# X_test_imputed_scaled.to_feather(path=datapath/'X_test_NaNcounts_imputed-Median-wIndicators_StandardScaled.feather')
# X_scaled_df = pd.DataFrame(X_scaled, columns=X_poly_names)

In [37]:
# Load the cached, already pre-processed test features; the commented-out cells above show how this feather file was written.
X_test = pd.read_feather(path=datapath/'X_test_NaNcounts_imputed-Median-wIndicators_StandardScaled.feather')

Now, you have to generate predictions for the test set using the constituent estimators, and then blend them together for the final prediction.

In [38]:
# Build the meta-learner's test-set input: one column per base estimator, holding
# column 1 of that estimator's predict_proba output, in the order of `model_list`.
# NOTE(review): the column order must match the one used to build X_meta for the
# meta-learner — confirm against the training function.
X_test_np = X_test.to_numpy()

base_preds = []  # serves the same purpose as `fold_preds` in the super_train func
for model in model_list:  # presumes tuples of the form (str, model)
    # The estimators are used as they are; nothing is re-fit in this cell.
    y_pred = model[1].predict_proba(X_test_np)[:, 1]
    base_preds.append(y_pred.reshape(len(y_pred), 1))

# There is only one block of rows for the test set (no folds); the one-element list
# and the vstack are kept so both names keep the types they had before.
X_test_meta = [np.hstack(base_preds)]
X_test_base = np.vstack(X_test_meta)

# Sanity check: one row per test row, one column per base estimator.
assert X_test_base.shape == (X_test_np.shape[0], len(base_preds)), X_test_base.shape



## Prediction Generation

In [39]:
# Blend the base estimators' test-set predictions with the fitted meta-learner.
# predict_proba returns one column per class (two columns here, see the output below).
meta_preds = meta_learner.predict_proba(X_test_base)

In [42]:
meta_preds

array([[0.37975558, 0.62024442],
       [0.86840193, 0.13159807],
       [0.38051642, 0.61948358],
       ...,
       [0.24316148, 0.75683852],
       [0.87790954, 0.12209046],
       [0.19314858, 0.80685142]])

In [None]:
# preds_path = Path(datapath/"preds/")

# blender_preds = blender.predict_proba(X_test_imputed_scaled)#[:,1]
# dump(blender_preds, preds_path/f"{config_run['name']}_stack.joblib")

# Submission

In [44]:
# Load the sample submission file; its 'claim' column is overwritten with the blended predictions below.
sample_df = pd.read_csv(datapath/'sample_solution.csv')

In [45]:
# Fill the submission's 'claim' column with column 1 of the blended probabilities.
# NOTE(review): assumes column 1 is the positive class and that the rows of meta_preds
# are in the same order as the ids in sample_solution.csv — confirm.
sample_df.loc[:, 'claim'] = meta_preds[:,1]

In [46]:
sample_df.head()

Unnamed: 0,id,claim
0,957919,0.620244
1,957920,0.131598
2,957921,0.619484
3,957922,0.127216
4,957923,0.142606


In [47]:
# Folder for submission files; exist_ok=True makes re-running this cell harmless.
submission_path = datapath/'submissions'
submission_path.mkdir(exist_ok=True)

In [48]:
# Write the submission CSV named after this run; index=False leaves the DataFrame index out of the file.
sample_df.to_csv(submission_path/f"{config_run['name']}_super-learner_LogisticRegression_full-dataset.csv", index=False)

In [49]:
# NOTE(review): hard-coded score, presumably copied by hand from the competition leaderboard
# after submitting the file written above — confirm, and update it whenever the submission changes.
wandb.log({'leaderboard_auc': 0.81560 })

In [50]:
wandb.finish()

VBox(children=(Label(value=' 2.24MB of 2.24MB uploaded (0.73MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
train_loss,0.47448
train_auc,0.86535
_runtime,4173
_timestamp,1631642214
_step,61
xgboost_params,{'objective': 'binar...
valid_loss,0.50711
valid_auc,0.81491
fold,4
lightgbm_params,{'boosting_type': 'g...


0,1
train_loss,▇█▁██▁██▁██▁██▁
train_auc,▂▁█▂▁█▂▁█▂▁█▂▁█
_runtime,▁▁▁▁▁▁▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▄▄▅▅▅▅▅▆▆▇▇▇▇▇▇▇▇█
_timestamp,▁▁▁▁▁▁▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▄▄▅▅▅▅▅▆▆▇▇▇▇▇▇▇▇█
_step,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
valid_loss,▇█▆▃▄▂▅▆▄▃▄▃▂▃▁
valid_auc,▃▁▄▆▅█▄▂▅▅▄▆▆▄█
fold,▁▁▁▃▃▃▅▅▅▆▆▆███
ensemble_full-dataset_auc,▁
leaderboard_auc,▁


## Manual Stacking

In [73]:
X.head()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,227,228,229,230,231,232,233,234,235,236
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.425545,-2.357891,-0.637206,-0.866657,-0.111568,-4.829243,-1.171229,-0.603397,-0.596871,-0.516828,...,-0.128368,-0.127677,-0.128242,-0.127867,-0.127119,-0.127985,-0.128494,-0.12862,-0.127854,-0.12703
1,0.2476,-0.323982,1.223569,0.361863,1.071182,-0.36114,0.082051,-0.74659,0.899454,0.469668,...,-0.128368,-0.127677,-0.128242,-0.127867,-0.127119,-0.127985,-0.128494,-0.12862,-0.127854,-0.12703
2,2.032371,-2.43568,-0.48896,0.341193,1.069656,0.118532,0.537069,-0.044075,-0.763516,1.056879,...,-0.128368,-0.127677,-0.128242,-0.127867,-0.127119,-0.127985,-0.128494,-0.12862,-0.127854,-0.12703
3,1.438373,-2.337605,-0.508914,-0.829607,1.485682,3.592008,-1.189087,-0.339152,-0.735281,-0.529158,...,-0.128368,-0.127677,-0.128242,-0.127867,-0.127119,-0.127985,-0.128494,-0.12862,-0.127854,-0.12703
4,0.602333,1.076218,-0.648438,0.463365,0.275053,-0.157989,0.727338,-0.905498,0.052478,-0.511066,...,-0.128368,-0.127677,-0.128242,-0.127867,-0.127119,-0.127985,-0.128494,-0.12862,7.821398,-0.12703


In [119]:
X1 = X.copy()

In [120]:
X1.shape

(957919, 237)

In [121]:
# Add one column per XGBoost fold model to X1, holding column 1 of that model's
# predict_proba output for every row of X (predictions are made on X, not on the growing X1).
# NOTE(review): if each fold model was trained on part of X, these predictions are partly
# in-sample — confirm before using them as stacking features.
for fold, fold_model in xgboost_models.items():
    pred_column = f"xgboost_fold{fold}_pred"
    X1[pred_column] = fold_model.predict_proba(X)[:, 1]



In [122]:
X1.head()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,232,233,234,235,236,xgboost_fold0_pred,xgboost_fold1_pred,xgboost_fold2_pred,xgboost_fold3_pred,xgboost_fold4_pred
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.425545,-2.357891,-0.637206,-0.866657,-0.111568,-4.829243,-1.171229,-0.603397,-0.596871,-0.516828,...,-0.127985,-0.128494,-0.12862,-0.127854,-0.12703,0.582566,0.58095,0.576743,0.569523,0.595877
1,0.2476,-0.323982,1.223569,0.361863,1.071182,-0.36114,0.082051,-0.74659,0.899454,0.469668,...,-0.127985,-0.128494,-0.12862,-0.127854,-0.12703,0.152252,0.150803,0.148316,0.155218,0.147297
2,2.032371,-2.43568,-0.48896,0.341193,1.069656,0.118532,0.537069,-0.044075,-0.763516,1.056879,...,-0.127985,-0.128494,-0.12862,-0.127854,-0.12703,0.794083,0.789945,0.788326,0.787177,0.797979
3,1.438373,-2.337605,-0.508914,-0.829607,1.485682,3.592008,-1.189087,-0.339152,-0.735281,-0.529158,...,-0.127985,-0.128494,-0.12862,-0.127854,-0.12703,0.774001,0.76851,0.774555,0.782187,0.773245
4,0.602333,1.076218,-0.648438,0.463365,0.275053,-0.157989,0.727338,-0.905498,0.052478,-0.511066,...,-0.127985,-0.128494,-0.12862,7.821398,-0.12703,0.759366,0.755764,0.763769,0.758034,0.758038
