# Baseline
Setting up a more robust baseline notebook, suitable for use with all of the "Big Three" (XGBoost, CatBoost, LightGBM) libraries and on either Google Colab or the local machine.

# Setup

In [1]:
# two manual flags (ex-config)
colab = False
gpu_available = False
libraries = ['xgboost', 'lightgbm', 'catboost']

In [2]:
# basic imports
from pathlib import Path
import os
import math
from datetime import datetime

In [3]:
%matplotlib inline
%config Completer.use_jedi = False
os.environ['WANDB_NOTEBOOK_NAME'] = f"stacking_{datetime.now().strftime('%Y%m%d')}.ipynb"

In [4]:
# handle Google Colab-specific library installation/updating
if colab:
    # much of the below inspired by or cribbed from the May 2021 Kaggle Tabular Playground winner, at 
    # https://colab.research.google.com/gist/academicsuspect/0aac7bd6e506f5f70295bfc9a3dc2250/tabular-may-baseline.ipynb?authuser=1#scrollTo=LJoVKJb5wN0L
    
    # Kaggle API for downloading the datasets
    !pip install --upgrade -q kaggle

    # weights and biases
    !pip install -qqqU wandb
    
    # Optuna for parameter search
    !pip install -q optuna

    # upgrade sklearn
    !pip install --upgrade scikit-learn

    !pip install category_encoders
    
    if 'catboost' in libraries:
        !pip install catboost
    
    if 'xgboost' in libraries:
        if gpu_available: 
            # this part is from https://github.com/rapidsai/gputreeshap/issues/24
            !pip install cmake --upgrade
            # !pip install sklearn --upgrade
            !git clone --recursive https://github.com/dmlc/xgboost
            %cd /content/xgboost
            !mkdir build
            %cd build
            !cmake .. -DUSE_CUDA=ON
            !make -j4
            %cd /content/xgboost/python-package
            !python setup.py install --use-cuda --use-nccl
            !/opt/bin/nvidia-smi
            !pip install shap
        else:
            !pip install --upgrade xgboost
    if 'lightgbm' in libraries:
        if gpu_available:
            # lighgbm gpu compatible
            !git clone --recursive https://github.com/Microsoft/LightGBM
            ! cd LightGBM && rm -rf build && mkdir build && cd build && cmake -DUSE_GPU=1 ../../LightGBM && make -j4 && cd ../python-package && python3 setup.py install --precompile --gpu;
        else:
            !pip install --upgrade lightgbm
        

        

Now, non-stdlib imports

In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm

# general ML tooling
from sklearn.model_selection import train_test_split, StratifiedKFold, KFold
from sklearn.metrics import log_loss, roc_auc_score
import wandb
from wandb.xgboost import wandb_callback
from wandb.lightgbm import wandb_callback
from sklearn.impute import SimpleImputer #, KNNImputer
# import timm

import seaborn as sns

from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
# from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler #, MinMaxScaler, MaxAbsScaler, RobustScaler, PolynomialFeatures
# from sklearn.feature_selection import SelectKBest, f_regression
from joblib import dump, load
# feature engineering tools
# from sklearn.feature_selection import mutual_info_regression
# import featuretools as ft


Now, datapath setup

In [6]:
# # This is the code for reading the train.csv and converting it to a .feather file
# df = pd.read_csv(datapath/'train.csv', index_col='id', low_memory=False)
# df.index.name = None
# df.to_feather(path='./dataset_df.feather')

In [7]:
if colab:
    # mount Google Drive
    from google.colab import drive
    drive.mount('/content/drive')
    
    # handling datapath
    datapath = Path('/content/drive/MyDrive/kaggle/tabular_playgrounds/sep2021/')
    
else:
    # if on local machine
    datapath = Path('/media/sf/easystore/kaggle_data/tabular_playgrounds/sep2021/')    
    


## Ex-Model Config

In [8]:
# meta-config for preprocessing and cross-validation, but NOT for model parameters
exmodel_config = {
    # model config
#     "model": XGBClassifier,
#     "n_estimators": 100, 
#     "max_depth": 3,
#     "learning_rate": 0.1,
#     "test_size": 0.2,
#     "reg_lambda": None, 
    "scaler": StandardScaler, # TODO: experiment with others (but imputation may be slow)
    "scale_b4_impute": False,
    "imputer": SimpleImputer(strategy='median', add_indicator=True),
    "knn_imputer_n_neighbors": None, # None if a different imputer is used
#     "feature_selector": SelectKBest,
#     "k_best": 80,
#     "feature_selection_scoring": f_regression,
    'random_state': 42,
#     'subsample': 1,
    'cross_val_strategy': KFold, # None for holdout, or the relevant sklearn class
    'kfolds': 5, # if 1, that means just doing holdout
    'test_size': 0.2,
#     'features_created': False,
#     'feature_creator': None,
}

## Data Setup

**TODO** Write some conditional logic here to automate it -- possibly as part of a sklearn.*pipeline

In [9]:
# if exmodel_config['scaler']:
#     scaler = exmodel_config['scaler']()
#     scaler.fit_transform()

In [10]:
# # here's how to load the original, unaltered dataset and separate features from targets
# df = pd.read_feather(path=datapath/'dataset_df.feather') # this is the unaltered original dataset
# features = [x for x in df.columns if x != 'claim']
# X = df[features]
# y = df.claim



# load the version of the dataset with imputations; X and y were stored separately, as feather and joblib respectively
X = pd.read_feather(datapath/'X_NaNcounts_imputed-Median-wIndicators-StandardScaled.feather') 
y = load(datapath/'y.joblib')    
X.index.name = 'id'
y.index.name = 'id'

exmodel_config['feature_count'] = len(X.columns)
exmodel_config['feature_generator'] = None

In [11]:
# scaler = exmodel_config['scaler']()
# X_scaled = scaler.fit_transform(X)
# X = pd.DataFrame(X_scaled, columns=X.columns)

In [12]:
X.head()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,227,228,229,230,231,232,233,234,235,236
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.425545,-2.357891,-0.637206,-0.866657,-0.111568,-4.829243,-1.171229,-0.603397,-0.596871,-0.516828,...,-0.128368,-0.127677,-0.128242,-0.127867,-0.127119,-0.127985,-0.128494,-0.12862,-0.127854,-0.12703
1,0.2476,-0.323982,1.223569,0.361863,1.071182,-0.36114,0.082051,-0.74659,0.899454,0.469668,...,-0.128368,-0.127677,-0.128242,-0.127867,-0.127119,-0.127985,-0.128494,-0.12862,-0.127854,-0.12703
2,2.032371,-2.43568,-0.48896,0.341193,1.069656,0.118532,0.537069,-0.044075,-0.763516,1.056879,...,-0.128368,-0.127677,-0.128242,-0.127867,-0.127119,-0.127985,-0.128494,-0.12862,-0.127854,-0.12703
3,1.438373,-2.337605,-0.508914,-0.829607,1.485682,3.592008,-1.189087,-0.339152,-0.735281,-0.529158,...,-0.128368,-0.127677,-0.128242,-0.127867,-0.127119,-0.127985,-0.128494,-0.12862,-0.127854,-0.12703
4,0.602333,1.076218,-0.648438,0.463365,0.275053,-0.157989,0.727338,-0.905498,0.052478,-0.511066,...,-0.128368,-0.127677,-0.128242,-0.127867,-0.127119,-0.127985,-0.128494,-0.12862,7.821398,-0.12703


In [13]:
y.head()

id
0    1
1    0
2    1
3    1
4    1
Name: claim, dtype: int64

### Model Config

In [14]:
def model_configurator(library, gpu_available=True):#, config=universal_config):
    """
    Function that provide task-specific or general preference arguments for the various models. 
    
    At first, will rely largely on defaults for hyperparameters, but later this function 
    can be supplemented later with optimal values, as they're learned in sweeps.
    .
    
    Rationale: creating a helper function will allow more experimentation later, and also
    composite runs that cycle through a series of models.
    
    :param model: A model from [XGBClassifier, LGBMClassifier, CatBoostClassifier]
    :return config: A dict that supplements default hyperparameter values with 1) 
                    task-appropriate ones, and perhaps later 2) optimal hyperparameter values.
    """
    config = {}
    
    # library-specific config
    if library in ['xgboost', 'lightgbm']:
        config['n_jobs'] = -1
        
    # best params per sweep `icac24c5`, generated from notebook `sweep_20210905.ipynb`
    # runtime per fold should be around 12m 38s
    # should get auc of 0.7434 on the random_state=42 holdout
    # haven't yet tried dart
    if library == 'xgboost':
#         config['tree_method'] = 'auto'
#         config['booster'] = 'gbtree' # or 'dart'
#         config['model'] = XGBClassifier
        config['verbosity'] = 1
        config['objective'] = 'binary:logistic'
#         config['eval_metric'] = ['auc', 'logloss', 'aucpr'],
        config['tree_method'] = 'gpu_hist' if (gpu_available and colab) else 'auto' 
        
        # comment out the below to get defaults
        config['n_estimators'] = 902
        config['learning_rate'] = 0.0304
        config['max_depth'] = 3
        config['reg_alpha'] = 0.863
        config['reg_lambda'] = 2.442
        config['subsample'] = 0.8627

    # best params per sweep `sjghewf0`, generated from notebook `sweep_lightgbm_20210907`
    # run name `sweep_lightgbm_20210907_195641`
    # runtime per fold should be around 39s
    # should get an auc of 0.7435 on random_state=42 holdout
    if library == 'lightgbm':
#         config['model'] = LGBMClassifier
        config['objective'] = 'binary'
        config['eval_metric'] = ['auc', 'logloss']
        config['boosting_type'] = 'gbdt' # or 'dart'
        config['device_type'] = 'cuda' if (gpu_available and colab) else 'cpu' # 'gpu' also possible, 'cpu' is default
        
        # comment out the below for defaults
        config['n_estimators'] = 1286
        config['learning_rate'] = 0.03221
        config['max_depth'] = 2
        config['reg_alpha'] = 0.4687
        config['reg_lambda'] = 0.1763
        config['subsample'] = 0.6621
        

#     if config['model'] == CatBoostClassifier:
    if library == 'catboost':
#         config['model'] = CatBoostClassifier
        config['task_type'] = 'GPU' if gpu_available else 'CPU'
        config['custom_metrics'] = ['Logloss', 'AUC'] # objective (loss fn) must be singular, defaults to Logloss
        config['n_estimators'] = 2000 # logged as "iterations" otherwise

    return config

## Weights and Biases Run Config

Below is the configuration for a Weights and Biases (`wandb`) run.

In [15]:
# wandb config:
config_run = {
    'name': f"{os.environ['WANDB_NOTEBOOK_NAME'][:-6]}_{datetime.now().strftime('%H%M%S')}", # just removes the .ipynb extension, leaving the notebook filename's stem
    'tags': ['attempt'],
    'notes': "Attempting sweep-best XGBoost and LightGBM plus defaultish CatBoost on newly preprocessed data (with a column containing NaN counts, median imputation w/indicators, and standard scaling)",
}

# Preprocessing
Scaling has already occurred -- used `StandardScaler` as a precursor to using `KNNImputer(n_neighbors=5)`, on the premise that imputation would proceed more quickly if things were already scaled. I may try different permutations of this later: using `IterativeImputer` instead, before or after scaling, potentially with different scalers. 

# Feature Creation and Selection

In [16]:
# load all the polynomialfeatures generated with `PolynomialFeatures(degree=2, include_bias=True, interaction_only=False)`
# X_np = np.load(datapath/'X_poly_unscaled.npy')
# X = pd.DataFrame(X_np)

In [17]:
# X.columns

In [18]:
# generation polynomial features
# poly = PolynomialFeatures(degree=2, include_bias=True, interaction_only=False)
# X_poly = poly.fit_transform(X)

In [19]:
# X_poly_names = poly.get_feature_names(X.columns)
# # X_poly_names[100:150]
# features = pd.read_csv('X_candidates_20210827.csv').columns

In [20]:
# checks = [feature in X_poly_names for feature in features]
# checks

In [21]:
# X = pd.DataFrame(X_poly, columns=X_poly_names)

In [22]:
# X = X[features[1:]]

# Training

In [23]:
def train(X_train, X_valid, y_train, y_valid, model_config, 
                                              random_state=42,
                                              exmodel_config=exmodel_config, 
                                              config_run=config_run):#, scaler): # passed in via config dict for now
    """
    Basic training function. Note that some of the options passed via the argument are
    in fact hard-coded in, to avoid inconveniences.
    :param X_train: the training set features
    :param X_valid: the validation set features
    :param y_train: the training set targets
    :param y_valid: the validation set targets
    :param random_staKFold: for reproducibility
    :param exmodel_config: dict containing configuration details including the library 
                            (thus model) used, preprocessing, and cross-validation
    :param model_config: dict containing hyperparameter specifications for the model
    :param config_run: dict containing wandb run configuration (name, etc)
    """
    
    
    wandb.init(
        project="202109_Kaggle_tabular_playground",
        save_code=True,
        tags=config_run['tags'],
        name=config_run['name'],
        notes=config_run['notes'],
        config=exmodel_config)   
        
    if exmodel_config['library'] == 'xgboost':
        model = XGBClassifier(
            tree_method=model_config['tree_method'],
            random_state=random_state,
            n_jobs=model_config['n_jobs'], 
            verbosity=model_config['verbosity'], 
            objective=model_config['objective'],
            # #             eval_metric=model_config['eval_metric'],

            # comment out the below for a fairly default model
#             booster=model_config['booster'],
            max_depth=model_config['max_depth'],
            learning_rate=model_config['learning_rate'], 
            subsample=model_config['subsample'],
            reg_alpha=model_config['reg_alpha'],
            reg_lambda=model_config['reg_lambda'],
            n_estimators=model_config['n_estimators'],
        )
        
        model.fit(X_train, y_train, callbacks=[wandb.xgboost.wandb_callback()],
#                                     eval_metric=model_config['eval_metric'],
                 )


    elif exmodel_config['library'] == 'lightgbm':
        model = LGBMClassifier(
#             boosting_type=model_config['boosting_type'],
#             max_depth=model_config['max_depth']
            # TODO
            random_state=random_state,
            n_jobs=model_config['n_jobs'],
            objective=model_config['objective'],
#             eval_metric=model_config['eval_metric'],
            boosting_type=model_config['boosting_type'],
            device_type=model_config['device_type'],
            
            # comment out the below for a basically default model
            n_estimators=model_config['n_estimators'],
            learning_rate=model_config['learning_rate'],
            max_depth=model_config['max_depth'],
            reg_alpha=model_config['reg_alpha'],
            reg_lambda=model_config['reg_lambda'],
            subsample=model_config['subsample'],
        )
        
        model.fit(X_train, y_train, callbacks=[wandb.lightgbm.wandb_callback()],
#                                     eval_metric=model_config['eval_metric'],
                 )
        
    elif exmodel_config['library'] == 'catboost':
        print("CatBoost, therefore no WandB callback.")
        model = CatBoostClassifier(
#             n_estimators=config['n_estimators'],
#             learning_rate=config['learning_rate'],
#             max_depth=config['max_depth'],
            task_type=model_config['task_type'],
    #         n_jobs=config['n_jobs'],
    #         verbosity=config['verbosity'],
    #         subsample=config['subsample'],
            n_estimators=model_config['n_estimators'],
            random_state=random_state,
            # objective='Logloss', # default, accepts only one
#             custom_metrics=model_config['custom_metrics'],
    #         bootstrap_type=config['bootstrap_type'],
    #         device:config['device']
        ) 
        model.fit(X_train, y_train)
        
#     y_train_pred = model.predict(X_train)
    y_train_pred = model.predict_proba(X_train)[:,1]

    train_loss = log_loss(y_train, y_train_pred)
    train_auc = roc_auc_score(y_train, y_train_pred)
    wandb.log({'train_loss': train_loss, 'train_auc': train_auc})

    if exmodel_config['library'] == 'catboost':
        print(model.get_all_params())
        wandb.log(model.get_all_params())
    else:
        wandb.log(model.get_params()) # logging model parameters, trying bare-invocation rather than params: model.get_params()
    
    # trying with predict_proba
    y_pred = model.predict_proba(X_valid)[:,1]
#     y_pred = model.predict(X_valid)

    valid_loss = log_loss(y_valid, y_pred)
    valid_auc = roc_auc_score(y_valid, y_pred)
    wandb.log({'valid_loss':valid_loss, 'valid_auc':valid_auc})
    print(f"Valid log-loss is {valid_loss}\nValid AUC is {valid_auc}")   
#     wandb.finish()   
    return model
    

In [24]:
def cross_validation(model_config, X=X, y=y, start_fold=0, exmodel_config=exmodel_config, random_state=42):
    """
    Function to handle model training process in the context of cross-validation -- via hold-out or via k-fold.
    If exmodel_config['cross_val_strategy'] == None, then any kfolds= input is ignored; otherwise, the number specified is used.
    
    :param kfolds: int specifying number of k-folds to use in cross-validation
    :param exmodel_config: dict containing general config including for cross-validation -- `kfold=1` implies hold-out
    """
    if exmodel_config['kfolds'] == 1:
        print("Proceeding with holdout")
        X_train, X_valid, y_train, y_valid = train_test_split(X, y, 
                                                      test_size=exmodel_config['test_size'], 
                                                      random_state=random_state,
                                                     )
        model = train(X_train, X_valid, y_train, y_valid, exmodel_config=exmodel_config, 
                                                    model_config=model_config,
                                                    config_run=config_run)
        wandb.finish()
        
    else:
        X, y = X.to_numpy(), y.to_numpy()
        kfold = exmodel_config['cross_val_strategy'](n_splits=exmodel_config['kfolds'], shuffle=True, random_state=random_state)
        models = {}
        model_path = Path(datapath/f"models/{config_run['name']}_{exmodel_config['kfolds']}folds/")
        (model_path).mkdir(exist_ok=True)
        for fold, (train_ids, valid_ids) in enumerate(kfold.split(X,y)):
            if fold < start_fold:
                continue
            else:
                print(f"FOLD {fold}")
                print("---------------------------------------------------")
                X_train, X_valid = X[train_ids], X[valid_ids]
                y_train, y_valid = y[train_ids], y[valid_ids]
                model = train(X_train, X_valid, y_train, y_valid, exmodel_config=exmodel_config, 
                                                    model_config=model_config,
                                                    config_run=config_run)
                wandb.log({'fold': fold})
                models[fold] = model
                dump(model, Path(model_path/f"{exmodel_config['library']}_fold{fold}_model.joblib"))
                wandb.finish()
        return models
        

# Interface

## Runs

In [36]:
library = 'xgboost'
exmodel_config['library'] = library
model_config = model_configurator(library)
xgboost_models = cross_validation(model_config)

FOLD 0
---------------------------------------------------




Valid log-loss is 0.5109356389636417
Valid AUC is 0.8122584973566012


VBox(children=(Label(value=' 0.15MB of 0.15MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
train_loss,0.50573
train_auc,0.82113
_runtime,488
_timestamp,1631294369
_step,905
objective,binary:logistic
use_label_encoder,True
base_score,0.5
booster,gbtree
colsample_bylevel,1


0,1
train_loss,▁
train_auc,▁
_runtime,▁▁▁▁
_timestamp,▁▁▁▁
_step,▁▃▆█
use_label_encoder,▁
base_score,▁
colsample_bylevel,▁
colsample_bynode,▁
colsample_bytree,▁


FOLD 1
---------------------------------------------------




Valid log-loss is 0.5084587895255492
Valid AUC is 0.8140504514066044


VBox(children=(Label(value=' 0.15MB of 0.15MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
train_loss,0.5064
train_auc,0.82071
_runtime,485
_timestamp,1631294872
_step,905
objective,binary:logistic
use_label_encoder,True
base_score,0.5
booster,gbtree
colsample_bylevel,1


0,1
train_loss,▁
train_auc,▁
_runtime,▁▁██
_timestamp,▁▁██
_step,▁▃▆█
use_label_encoder,▁
base_score,▁
colsample_bylevel,▁
colsample_bynode,▁
colsample_bytree,▁


FOLD 2
---------------------------------------------------




Valid log-loss is 0.5097983286020547
Valid AUC is 0.8128374867961028


VBox(children=(Label(value=' 0.16MB of 0.16MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
train_loss,0.50606
train_auc,0.82094
_runtime,529
_timestamp,1631295407
_step,905
objective,binary:logistic
use_label_encoder,True
base_score,0.5
booster,gbtree
colsample_bylevel,1


0,1
train_loss,▁
train_auc,▁
_runtime,▁▁▁▁
_timestamp,▁▁▁▁
_step,▁▃▆█
use_label_encoder,▁
base_score,▁
colsample_bylevel,▁
colsample_bynode,▁
colsample_bytree,▁


FOLD 3
---------------------------------------------------




Valid log-loss is 0.508592728157079
Valid AUC is 0.8134012906552728


VBox(children=(Label(value=' 0.17MB of 0.17MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
train_loss,0.50633
train_auc,0.82084
_runtime,482
_timestamp,1631295894
_step,905
objective,binary:logistic
use_label_encoder,True
base_score,0.5
booster,gbtree
colsample_bylevel,1


0,1
train_loss,▁
train_auc,▁
_runtime,▁▁▁▁
_timestamp,▁▁▁▁
_step,▁▃▆█
use_label_encoder,▁
base_score,▁
colsample_bylevel,▁
colsample_bynode,▁
colsample_bytree,▁


FOLD 4
---------------------------------------------------




Valid log-loss is 0.5078269726162877
Valid AUC is 0.8138386718432461


VBox(children=(Label(value=' 0.18MB of 0.18MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
train_loss,0.50649
train_auc,0.82061
_runtime,539
_timestamp,1631296439
_step,905
objective,binary:logistic
use_label_encoder,True
base_score,0.5
booster,gbtree
colsample_bylevel,1


0,1
train_loss,▁
train_auc,▁
_runtime,▁▁▁▁
_timestamp,▁▁▁▁
_step,▁▃▆█
use_label_encoder,▁
base_score,▁
colsample_bylevel,▁
colsample_bynode,▁
colsample_bytree,▁


In [26]:
# for scaler in [StandardScaler, MaxAbsScaler, MinMaxScaler, RobustScaler]:
#     exmodel_config['scaler'] = scaler
#     scaler = scaler()
#     X_scaled = scaler.fit_transform(X)
#     X = pd.DataFrame(X_scaled, columns=X.columns)
#     exmodel_config['library'] = 'lightgbm'
#     model_config = model_configurator('lightgbm')
#     cross_validation(model_config)

In [27]:
library = 'lightgbm'
exmodel_config['library'] = library
model_config = model_configurator(library)
lightgbm_models = cross_validation(model_config)

FOLD 0
---------------------------------------------------


[34m[1mwandb[0m: Currently logged in as: [33mhushifang[0m (use `wandb login --relogin` to force relogin)


Valid log-loss is 0.5113911181729013
Valid AUC is 0.8114826365905788


VBox(children=(Label(value=' 0.10MB of 0.10MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
train_loss,0.50759
train_auc,0.81709
_runtime,43
_timestamp,1631290946
_step,1289
boosting_type,gbdt
colsample_bytree,1.0
importance_type,split
learning_rate,0.03221
max_depth,2


0,1
train_loss,▁
train_auc,▁
_runtime,▁▁██
_timestamp,▁▁██
_step,▁▃▆█
colsample_bytree,▁
learning_rate,▁
max_depth,▁
min_child_samples,▁
min_child_weight,▁


FOLD 1
---------------------------------------------------


Valid log-loss is 0.5089415144259191
Valid AUC is 0.8132048676403175


VBox(children=(Label(value=' 0.11MB of 0.11MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
train_loss,0.50828
train_auc,0.81655
_runtime,44
_timestamp,1631290995
_step,1289
boosting_type,gbdt
colsample_bytree,1.0
importance_type,split
learning_rate,0.03221
max_depth,2


0,1
train_loss,▁
train_auc,▁
_runtime,▁▁██
_timestamp,▁▁██
_step,▁▃▆█
colsample_bytree,▁
learning_rate,▁
max_depth,▁
min_child_samples,▁
min_child_weight,▁


FOLD 2
---------------------------------------------------


Valid log-loss is 0.5102146583888731
Valid AUC is 0.812064302187439


VBox(children=(Label(value=' 0.11MB of 0.11MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
train_loss,0.50791
train_auc,0.8169
_runtime,45
_timestamp,1631291045
_step,1289
boosting_type,gbdt
colsample_bytree,1.0
importance_type,split
learning_rate,0.03221
max_depth,2


0,1
train_loss,▁
train_auc,▁
_runtime,▁▁██
_timestamp,▁▁██
_step,▁▃▆█
colsample_bytree,▁
learning_rate,▁
max_depth,▁
min_child_samples,▁
min_child_weight,▁


FOLD 3
---------------------------------------------------


Valid log-loss is 0.5090037922697467
Valid AUC is 0.812786647498434


VBox(children=(Label(value=' 0.12MB of 0.12MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
train_loss,0.5082
train_auc,0.8168
_runtime,44
_timestamp,1631291095
_step,1289
boosting_type,gbdt
colsample_bytree,1.0
importance_type,split
learning_rate,0.03221
max_depth,2


0,1
train_loss,▁
train_auc,▁
_runtime,▁▁██
_timestamp,▁▁██
_step,▁▃▆█
colsample_bytree,▁
learning_rate,▁
max_depth,▁
min_child_samples,▁
min_child_weight,▁


FOLD 4
---------------------------------------------------


Valid log-loss is 0.5082532843485519
Valid AUC is 0.8131253951692738


VBox(children=(Label(value=' 0.12MB of 0.12MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
train_loss,0.50839
train_auc,0.81663
_runtime,44
_timestamp,1631291144
_step,1289
boosting_type,gbdt
colsample_bytree,1.0
importance_type,split
learning_rate,0.03221
max_depth,2


0,1
train_loss,▁
train_auc,▁
_runtime,▁▁██
_timestamp,▁▁██
_step,▁▃▆█
colsample_bytree,▁
learning_rate,▁
max_depth,▁
min_child_samples,▁
min_child_weight,▁


# Stacking

## Via `sklearn.ensemble.StackingClassifier`

In [28]:
from sklearn.ensemble import StackingClassifier

In [29]:
# xgboost_estimators = [(f'xgboost_fold{fold}', xgboost_models[fold]) for fold in range(5)]

In [30]:
# leaving this default for first try
# final_estimator = 

In [39]:
# revision below

def stacker(estimators:dict, library:str, X=X, y=y): #, load_models:bool=False, load_path:Path=None):
    """
    A wrapper that will take a dict of the form {fold:int : model} and a string representing the library (for file-naming), 
    then run `sklearn.ensemble.StackingClassifier` with it, and save the stacked model afterward
    """
    estimators_list = [(f'{library}_fold{fold}', estimators[fold]) for fold in range(5)]
    blender = StackingClassifier(estimators=estimators_list,
                                 cv=5,
                                 stack_method='predict_proba',
                                 n_jobs=2,
                                 passthrough=False,
                                 verbose=1
                                )
    print(f"Starting fitting at {datetime.now().strftime('%Y%m%d_%H%M%S')}")
    blender.fit(X,y)
    print(f"Fitting complete at {datetime.now().strftime('%Y%m%d_%H%M%S')}")
    dump(blender, filename=datapath/f"models/{config_run['name']}_{exmodel_config['kfolds']}folds/{library}_stack.joblib")
    print(f"Blender model saved at {datetime.now().strftime('%Y%m%d_%H%M%S')}")
    return blender
    

In [35]:
stacker(lightgbm_models, 'lightgbm')

Starting fitting at 20210910_093530




Fitting complete at 20210910_095542
Blender model saved at 20210910_095543


StackingClassifier(cv=5,
                   estimators=[('lightgbm_fold0',
                                LGBMClassifier(device_type='cpu',
                                               learning_rate=0.03221,
                                               max_depth=2, n_estimators=1286,
                                               objective='binary',
                                               random_state=42,
                                               reg_alpha=0.4687,
                                               reg_lambda=0.1763,
                                               subsample=0.6621)),
                               ('lightgbm_fold1',
                                LGBMClassifier(device_type='cpu',
                                               learning_rate=0.03221,
                                               max_depth=2, n_estimators=1286,
                                               objective='binary',...
                                              

In [40]:
stacker(xgboost_models, 'xgboost')

Starting fitting at 20210910_122952
Fitting complete at 20210910_185436
Blender model saved at 20210910_185438


StackingClassifier(cv=5,
                   estimators=[('xgboost_fold0',
                                XGBClassifier(base_score=0.5, booster='gbtree',
                                              colsample_bylevel=1,
                                              colsample_bynode=1,
                                              colsample_bytree=1, gamma=0,
                                              gpu_id=-1, importance_type='gain',
                                              interaction_constraints='',
                                              learning_rate=0.0304,
                                              max_delta_step=0, max_depth=3,
                                              min_child_weight=1, missing=nan,
                                              monotone_constraints='()',
                                              n_estimators=902, n...
                                              interaction_constraints='',
                                           

In [58]:
wandb.finish()


Problem finishing run
Traceback (most recent call last):
  File "/home/sf/anaconda3/envs/tabular/lib/python3.8/site-packages/wandb/sdk/wandb_run.py", line 1579, in _atexit_cleanup
    self._on_finish()
  File "/home/sf/anaconda3/envs/tabular/lib/python3.8/site-packages/wandb/sdk/wandb_run.py", line 1715, in _on_finish
    self.history._flush()
  File "/home/sf/anaconda3/envs/tabular/lib/python3.8/site-packages/wandb/sdk/wandb_history.py", line 59, in _flush
    self._callback(row=self._data, step=self._step)
  File "/home/sf/anaconda3/envs/tabular/lib/python3.8/site-packages/wandb/sdk/wandb_run.py", line 902, in _history_callback
    self._backend.interface.publish_history(
  File "/home/sf/anaconda3/envs/tabular/lib/python3.8/site-packages/wandb/sdk/interface/interface.py", line 223, in publish_history
    item.value_json = json_dumps_safer_history(v)  # type: ignore
  File "/home/sf/anaconda3/envs/tabular/lib/python3.8/site-packages/wandb/util.py", line 749, in json_dumps_safer_histo

In [59]:
# might encapsulate this in a new version of the above train function later
exmodel_config['ensemble'] = True

# wandb.init(
#         project="202109_Kaggle_tabular_playground",
#         save_code=True,
#         tags=config_run['tags'],
#         name=config_run['name'],
#         notes=config_run['notes'],
#         config=exmodel_config)   

random_state = exmodel_config['random_state'] # 42

model_config = model_configurator('xgboost')
xgboost_model = XGBClassifier(
            tree_method=model_config['tree_method'],
            random_state=random_state,
#             n_jobs=model_config['n_jobs'], 
            verbosity=model_config['verbosity'], 
            objective=model_config['objective'],
            # #             eval_metric=model_config['eval_metric'],

            # comment out the below for a fairly default model
#             booster=model_config['booster'],
            max_depth=model_config['max_depth'],
            learning_rate=model_config['learning_rate'], 
            subsample=model_config['subsample'],
            reg_alpha=model_config['reg_alpha'],
            reg_lambda=model_config['reg_lambda'],
            n_estimators=model_config['n_estimators'],
        )

model_config = model_configurator('lightgbm')
lightgbm_model = LGBMClassifier(
            random_state=random_state,
#             n_jobs=model_config['n_jobs'],
            objective=model_config['objective'],
            boosting_type=model_config['boosting_type'],
            device_type=model_config['device_type'],
            
            # comment out the below for a basically default model
            n_estimators=model_config['n_estimators'],
            learning_rate=model_config['learning_rate'],
            max_depth=model_config['max_depth'],
            reg_alpha=model_config['reg_alpha'],
            reg_lambda=model_config['reg_lambda'],
            subsample=model_config['subsample'],
        )

model_config = model_configurator('catboost')
catboost_model = CatBoostClassifier(
            task_type=model_config['task_type'],
            n_estimators=model_config['n_estimators'],
            random_state=random_state,
        ) 



estimators_list = [
    ('xgboost', xgboost_model),
    ('lightgbm', lightgbm_model),
    ('catboost', catboost_model)
]

# wandb.log({'estimators': estimators_list})


blender = StackingClassifier(estimators=estimators_list,
                                 cv=5,
                                 stack_method='predict_proba',
                                 n_jobs=2,
                                 passthrough=False,
                                 verbose=1
                            )

# wandb.log({'blender': blender})

print(f"Starting fitting at {datetime.now().strftime('%Y%m%d_%H%M%S')}")
blender.fit(X,y) # unsure of this -- given kwarg cv=5, is it producing the splits? Or do I have to somehow?
print(f"Fitting complete at {datetime.now().strftime('%Y%m%d_%H%M%S')}")
dump(blender, filename=datapath/f"models/{config_run['name']}_{exmodel_config['kfolds']}folds/{library}_stack.joblib")
print(f"Blender model saved at {datetime.now().strftime('%Y%m%d_%H%M%S')}")

Starting fitting at 20210910_212035


CatBoostError: catboost/cuda/cuda_lib/devices_provider.h:185: Error: device already requested 0

In [None]:
?b

In [None]:
# xgboost_stack = StackingClassifier(estimators=xgboost_estimators, 
#                                    cv=5, 
#                                    stack_method='predict_proba', 
#                                    n_jobs=-1, 
#                                    passthrough=False, 
#                                    verbose=1)
# xgboost_stack.fit(X,y)
# dump(xgboost_stack, filename=datapath/f"models/{config_run['name']}_{exmodel_config['kfolds']}folds/xgboost_stack.joblib")

In [None]:
print(datetime.now().strftime('%Y%m%d_%H%M%S')

In [None]:
lightgbm_estimators = [(f'lightgbm_fold{fold}', lightgbm_models[fold]) for fold in range(5)]
lightgbm_stack = StackingClassifier(estimators=lightgbm_estimators, 
                                   cv=5, 
                                   stack_method='predict_proba', 
                                   n_jobs=-1, 
                                   passthrough=False, 
                                   verbose=1)
lightgbm_stack.fit(X,y)
dump(xgboost_stack, filename=datapath/f"models/{config_run['name']}_{exmodel_config['kfolds']}folds/lightgbm_stack.joblib")
print(datetime.now().strftime('%Y%m%d_%H%M%S')

In [None]:
# library = 'catboost'
# gpu_available = True
# exmodel_config['library'] = library
# model_config = model_configurator(library)
# catboost_models = cross_validation(model_config)

In [None]:
# # this loads models if you need to (or forgot to save them on training above)
# xgboost_models = {}
# xgboost_models_path = Path('/media/sf/easystore/kaggle_data/tabular_playgrounds/sep2021/models/baseline_20210905a_152521_5folds/xgboost/')
# for fold in range(5):
#     xgboost_models[fold] = load(xgboost_models_path/f'xgboost_fold{fold}_model.joblib')

# Inference

In [41]:
test_df = pd.read_csv(datapath/'test.csv', index_col='id', low_memory=False)
# test_df.to_feather(datapath/'test.feather') # issue with index being non-default; fix later
# test_df = pd.read_feather(datapath/'test.feather')

In [42]:
test_df.head()

Unnamed: 0_level_0,f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,...,f109,f110,f111,f112,f113,f114,f115,f116,f117,f118
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
957919,0.16585,0.48705,1295.0,0.0231,0.319,0.90188,573.29,3743.7,2705700000000.0,6221.0,...,0.16253,-22.189,2.0655,0.43088,-10.741,81606.0,1.194,198040000000000.0,2017.1,0.46357
957920,0.12965,0.37348,1763.0,0.72884,0.33247,-1.2631,875.55,554370.0,595570000000000.0,934.43,...,0.81528,-1.6342,1.5736,-1.0712,11.832,90114.0,1.1507,4.388e+16,6638.9,0.28125
957921,0.12019,0.44521,736.26,0.04615,0.29605,0.31665,2659.5,317140.0,397780000000000.0,131.81,...,0.81831,-32.78,2.1364,-1.9312,-3.2804,37739.0,1.1548,171810000000000.0,5844.0,0.13797
957922,0.054008,0.39596,996.14,0.85934,0.36678,-0.1706,386.56,325680.0,-34322000000000.0,-26.473,...,0.86559,-2.4162,1.5199,-0.011633,1.384,26849.0,1.149,2.1388e+17,6173.3,0.3291
957923,0.079947,-0.006919,10574.0,0.34845,0.45008,-1.842,3027.0,428150.0,929150000000.0,5999.4,...,0.2519,-18.63,3.7387,0.75708,-4.9405,50336.0,1.2488,2.1513e+17,2250.1,0.33796


# Test set preprocessing


(Here's where encapsulating the transformations in a pipeline would come in handy. But I'll do it manually for now.)

In [43]:
features = [x for x in test_df.columns if x != 'claim']
X_test = test_df[features] # this is just for naming consistency

Now, let's get the features the model was trained on and subset the test set's features accordingly

In [None]:
# generation polynomial features
# poly = PolynomialFeatures(degree=2, include_bias=True, interaction_only=False)
# X_test_poly = poly.fit_transform(X_test)

In [None]:
# X_test_poly_names = poly.get_feature_names(X_test.columns)
# X_poly_names[100:150]
# features = pd.read_csv('X_candidates_20210827.csv').columns

In [None]:
# checks = [feature in X_test_poly_names for feature in features]
# checks

In [None]:
# X_test_final = pd.DataFrame(X_test_poly, columns=X_test_poly_names)

In [None]:
# X_test_final = X_test_final[features[1:]]
# X_test_final = X_test

In [None]:
X_test['nan_count'] = X_test.isnull().sum(axis=1)

In [None]:
imputer = SimpleImputer(strategy='median', add_indicator=True)
X_test_imputed_np = imputer.fit_transform(X_test)

In [None]:
X_test_imputed = pd.DataFrame(X_test_imputed, columns=[str(x) for x in range(X_test_imputed.shape[1])])
X_test_imputed.to_feather(path=datapath/'X_test_NaNcounts_imputed-Median-wIndicators.feather')

In [None]:
scaler = exmodel_config['scaler']()
X_test_imputed_scaled_np = scaler.fit_transform(X_test_imputed)
X_test_imputed_scaled = pd.DataFrame(X_test_imputed_scaled_np, columns=X_test_imputed.columns)
X_test_imputed_scaled.to_feather(path=datapath/'X_test_NaNcounts_imputed-Median-wIndicators_StandardScaled.feather')
# X_scaled_df = pd.DataFrame(X_scaled, columns=X_poly_names)

In [44]:
X_test_imputed_scaled = pd.read_feather(path=datapath/'X_test_NaNcounts_imputed-Median-wIndicators_StandardScaled.feather')

## Prediction Generation

In [None]:
xgboost_preds = {}
for fold in xgboost_models.keys():
    xgboost_preds[fold] = xgboost_models[fold].predict_proba(X_test_imputed_scaled)[:,1]

In [45]:
lightgbm_preds = {}
for fold in lightgbm_models.keys():
    lightgbm_preds[fold] = lightgbm_models[fold].predict_proba(X_test_imputed_scaled)[:,1]

In [None]:
catboost_preds = {}
for fold in catboost_models.keys():
    catboost_preds[fold] = catboost_models[fold].predict_proba(X_test_imputed_scaled)[:,1]

In [46]:
preds_path = Path(datapath/f"preds/{config_run['name']}_{exmodel_config['kfolds']}folds_probs/")
preds_path.mkdir(exist_ok=True)

for library in ['xgboost', 'lightgbm', 'catboost']:
    (preds_path/library).mkdir(exist_ok=True)

In [None]:
dump(xgboost_preds, Path(preds_path/'xgboost/xgboost_preds_dict.joblib'))

In [47]:
dump(lightgbm_preds, Path(preds_path/'lightgbm/lightgbm_preds_dict.joblib'))

['/media/sf/easystore/kaggle_data/tabular_playgrounds/sep2021/preds/stacking_20210910_092142_5folds_probs/lightgbm/lightgbm_preds_dict.joblib']

In [None]:
dump(catboost_preds, Path(preds_path/'catboost/catboost_preds_dict.joblib'))

In [48]:
sample_df = pd.read_csv(datapath/'sample_solution.csv')

In [49]:
# final_xgboost_preds = (xgboost_preds[0] + xgboost_preds[1] + xgboost_preds[2] + xgboost_preds[3] + xgboost_preds[4]) / 5
final_lightgbm_preds = (lightgbm_preds[0] + lightgbm_preds[1] + lightgbm_preds[2] + lightgbm_preds[3] + lightgbm_preds[4]) / 5
# final_catboost_preds = (catboost_preds[0] + catboost_preds[1] + catboost_preds[2] + catboost_preds[3] + catboost_preds[4]) / 5

In [None]:
print(final_xgboost_preds[:10])
print(final_lightgbm_preds[:10])
print(final_catboost_preds[:10])

In [None]:
sample_df.loc[:, 'claim'] = final_xgboost_preds

In [None]:
sample_df.head()

In [51]:
submission_path = datapath/'submissions'
submission_path.mkdir(exist_ok=True)

In [None]:
sample_df.to_csv(submission_path/f"{config_run['name']}_{exmodel_config['kfolds']}folds_prob_xgboost-mean.csv", index=False)

In [52]:
sample_df.loc[:, 'claim'] = final_lightgbm_preds
sample_df.to_csv(submission_path/f"{config_run['name']}_{exmodel_config['kfolds']}folds_prob_lightgbm-mean.csv", index=False)

In [54]:
lightgbm_stack = load('/media/sf/easystore/kaggle_data/tabular_playgrounds/sep2021/models/stacking_20210910_092142_5folds/lightgbm_stack.joblib')
lightgbm_stack_preds = lightgbm_stack.predict_proba(X_test_imputed_scaled)[:,1]

In [55]:
sample_df.loc[:, 'claim'] = lightgbm_stack_preds
sample_df.to_csv(submission_path/f"{config_run['name']}_lightgbm_stack_preds.csv", index=False)

In [None]:
sample_df.loc[:, 'claim'] = final_catboost_preds
sample_df.to_csv(submission_path/f"{config_run['name']}_{exmodel_config['kfolds']}folds_prob_catboost-mean.csv", index=False)

In [None]:
ensemble_preds = (final_xgboost_preds + final_lightgbm_preds + final_catboost_preds) / 3

In [None]:
sample_df.loc[:, 'claim'] = ensemble_preds
sample_df.to_csv(submission_path/f"{config_run['name']}_{exmodel_config['kfolds']}folds_prob_ensemble-equal_model_and_fold_weight_mean.csv", index=False)

In [None]:
ensemble_preds = 0.4*final_xgboost_preds + 0.3*final_lightgbm_preds + 0.3*final_catboost_preds

In [None]:
sample_df.loc[:, 'claim'] = ensemble_preds
sample_df.to_csv(submission_path/f"{config_run['name']}_{exmodel_config['kfolds']}folds_prob_ensemble-0.4xgboost_0.3lightgbm_0.3catboost-equal_fold_weight_mean.csv", index=False)

## Manual Stacking

In [73]:
X.head()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,227,228,229,230,231,232,233,234,235,236
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.425545,-2.357891,-0.637206,-0.866657,-0.111568,-4.829243,-1.171229,-0.603397,-0.596871,-0.516828,...,-0.128368,-0.127677,-0.128242,-0.127867,-0.127119,-0.127985,-0.128494,-0.12862,-0.127854,-0.12703
1,0.2476,-0.323982,1.223569,0.361863,1.071182,-0.36114,0.082051,-0.74659,0.899454,0.469668,...,-0.128368,-0.127677,-0.128242,-0.127867,-0.127119,-0.127985,-0.128494,-0.12862,-0.127854,-0.12703
2,2.032371,-2.43568,-0.48896,0.341193,1.069656,0.118532,0.537069,-0.044075,-0.763516,1.056879,...,-0.128368,-0.127677,-0.128242,-0.127867,-0.127119,-0.127985,-0.128494,-0.12862,-0.127854,-0.12703
3,1.438373,-2.337605,-0.508914,-0.829607,1.485682,3.592008,-1.189087,-0.339152,-0.735281,-0.529158,...,-0.128368,-0.127677,-0.128242,-0.127867,-0.127119,-0.127985,-0.128494,-0.12862,-0.127854,-0.12703
4,0.602333,1.076218,-0.648438,0.463365,0.275053,-0.157989,0.727338,-0.905498,0.052478,-0.511066,...,-0.128368,-0.127677,-0.128242,-0.127867,-0.127119,-0.127985,-0.128494,-0.12862,7.821398,-0.12703


In [119]:
X1 = X.copy()

In [120]:
X1.shape

(957919, 237)

In [121]:
# generate probability predictions for the XGBoost model's folds
for fold in xgboost_models.keys():
#     X1[f"xgboost_fold{fold}_pred"] = xgboost_models[fold].predict(X)
    X1[f"xgboost_fold{fold}_pred"] = xgboost_models[fold].predict_proba(X)[:,1]
#     xgboost_preds[fold] = xgboost_models[fold].predict(X_test_imputed_scaled)



In [122]:
X1.head()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,232,233,234,235,236,xgboost_fold0_pred,xgboost_fold1_pred,xgboost_fold2_pred,xgboost_fold3_pred,xgboost_fold4_pred
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.425545,-2.357891,-0.637206,-0.866657,-0.111568,-4.829243,-1.171229,-0.603397,-0.596871,-0.516828,...,-0.127985,-0.128494,-0.12862,-0.127854,-0.12703,0.582566,0.58095,0.576743,0.569523,0.595877
1,0.2476,-0.323982,1.223569,0.361863,1.071182,-0.36114,0.082051,-0.74659,0.899454,0.469668,...,-0.127985,-0.128494,-0.12862,-0.127854,-0.12703,0.152252,0.150803,0.148316,0.155218,0.147297
2,2.032371,-2.43568,-0.48896,0.341193,1.069656,0.118532,0.537069,-0.044075,-0.763516,1.056879,...,-0.127985,-0.128494,-0.12862,-0.127854,-0.12703,0.794083,0.789945,0.788326,0.787177,0.797979
3,1.438373,-2.337605,-0.508914,-0.829607,1.485682,3.592008,-1.189087,-0.339152,-0.735281,-0.529158,...,-0.127985,-0.128494,-0.12862,-0.127854,-0.12703,0.774001,0.76851,0.774555,0.782187,0.773245
4,0.602333,1.076218,-0.648438,0.463365,0.275053,-0.157989,0.727338,-0.905498,0.052478,-0.511066,...,-0.127985,-0.128494,-0.12862,7.821398,-0.12703,0.759366,0.755764,0.763769,0.758034,0.758038
