# Hybrid
Going to attempt a hybrid model after the example of [this Teck Meng Wong notebook](https://www.kaggle.com/teckmengwong/tps2201-hybrid-time-series/notebook).

In [1]:
# notebook configuration
# if '/sf/' in pwd:
#     COLAB, SAGE = False, False
# elif 'google.colab' in str(get_ipython()):
#     COLAB, SAGE = True, False # do colab-specific installs later
# else:
#     COLAB, SAGE = False, True
    
CONTEXT = 'local' # or 'colab', 'sage', 'kaggle'
USE_GPU = True 
%config Completer.use_jedi = False

## Imports

In [2]:
# basic imports
from pathlib import Path
import os
import math
from datetime import datetime
import random

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import warnings
warnings.filterwarnings("ignore")

import requests # for telegram notifications
from tqdm.notebook import tqdm

from joblib import dump, load

Now, non-stdlib imports

In [3]:
# model selection
from sklearn.model_selection import train_test_split, StratifiedKFold, KFold

# metrics
# from sklearn.metrics import accuracy_score#, log_loss, roc_auc_score

# eda
import missingno
# import doubtlab 

# data cleaning
# from sklearn.impute import SimpleImputer #, KNNImputer
# import cleanlab

# normalization
# from sklearn.preprocessing import RobustScaler, StandardScaler, MinMaxScaler, MaxAbsScaler, RobustScaler, QuantileTransformer
# from gauss_rank_scaler import GaussRankScaler

# feature generation
# from sklearn.preprocessing import PolynomialFeatures
# import category_encoders as ce

# models
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
# from sklearn.linear_model import LogisticRegression
# from sklearn.ensemble import StackingClassifier, RandomForestClassifier

# feature reduction
# from sklearn.decomposition import PCA
# from umap import UMAP

# clustering
# from sklearn.cluster import DBSCAN, KMeans
# import hdbscan

# feature selection
# from sklearn.feature_selection import SelectKBest, f_regression, mutual_info_regression
# import featuretools as ft
# from BorutaShap import BorutaShap
# from boruta import BorutaPy

# tracking 
import wandb
from wandb.xgboost import wandb_callback
from wandb.lightgbm import wandb_callback
os.environ['WANDB_NOTEBOOK_NAME'] = f"nb_{datetime.now().strftime('%Y%m%d')}.ipynb"

In [4]:
# deep learning
import torch
from torch.optim import Adam, AdamW, Adagrad, SGD, RMSprop, LBFGS
from torch.optim.lr_scheduler import ReduceLROnPlateau, CosineAnnealingWarmRestarts, CyclicLR, OneCycleLR, StepLR, CosineAnnealingLR

# widedeep
# from pytorch_widedeep import Trainer
# from pytorch_widedeep.preprocessing import WidePreprocessor, TabPreprocessor
# from pytorch_widedeep.models import Wide, TabMlp, WideDeep, SAINT#, TabTransformer, TabNet, TabFastFormer, TabResnet
# from pytorch_widedeep.metrics import Accuracy
# from pytorch_widedeep.callbacks import EarlyStopping, LRHistory, ModelCheckpoint

In [5]:
# # time series
# import tsfresh

# import darts
# from darts import TimeSeries
import holidays
import dateutil.easter as easter

In [6]:
# from darts.models import ExponentialSmoothing, AutoARIMA, ARIMA, Prophet, RandomForest, RegressionEnsembleModel, RegressionModel, TFTModel, TCNModel, TransformerModel, NBEATSModel

## Routing

Now, datapath setup

In [7]:
if CONTEXT == 'colab':
    # mount Google Drive
    from google.colab import drive
    drive.mount('/content/drive')
    
    # handling datapath
    # datapath = Path('/content/drive/MyDrive/kaggle/tabular_playgrounds/dec2021/')
    root = Path('') # TODO

elif CONTEXT == 'sage':
    root = Path('') # TODO
    
elif CONTEXT == 'kaggle':
    root = Path('') # TODO
    
else: # if on local machine
    root = Path('/media/sf/easystore/kaggle_data/tabular_playgrounds/jan2022/')
    datapath = root/'datasets'
    # edapath = root/'EDA'
    # modelpath = Path('/media/sf/easystore/kaggle_data/tabular_playgrounds/oct2021/models/')
    predpath = root/'preds'
    subpath = root/'submissions'
    studypath = root/'studies'
    
    for pth in [datapath, predpath, subpath, studypath]:
        pth.mkdir(exist_ok=True)

## Helpers

In [8]:
SEED = 42

# Function to seed everything but the models
def seed_everything(seed, pytorch=True, reproducible=True):
    random.seed(seed)
    np.random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    if pytorch:
        torch.manual_seed(seed) # set torch CPU seed
        if torch.cuda.is_available():
            torch.cuda.manual_seed_all(seed) # set torch GPU(s) seed(s)
        if reproducible and torch.backends.cudnn.is_available():
            torch.backends.cudnn.deterministic = True
            torch.backends.cudnn.benchmark = False

seed_everything(seed=SEED)

In [9]:
def reduce_memory_usage(df, verbose=True):
    """
    Function to reduce memory usage by downcasting datatypes in a Pandas DataFrame when possible.
    
    h/t to Bryan Arnold (https://www.kaggle.com/puremath86/label-correction-experiments-tps-nov-21)
    """
    
    numerics = ["int8", "int16", "int32", "int64", "float16", "float32", "float64"]
    start_mem = df.memory_usage().sum() / 1024 ** 2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == "int":
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if (
                    c_min > np.finfo(np.float16).min
                    and c_max < np.finfo(np.float16).max
                ):
                    df[col] = df[col].astype(np.float16)
                elif (
                    c_min > np.finfo(np.float32).min
                    and c_max < np.finfo(np.float32).max
                ):
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024 ** 2
    if verbose:
        print(
            "Mem. usage decreased to {:.2f} Mb ({:.1f}% reduction)".format(
                end_mem, 100 * (start_mem - end_mem) / start_mem
            )
        )
    return df

In [10]:
tg_api_token = 'your_api_token' # for Galileo (jupyter_watcher_bot) on Telegram
tg_chat_id = 'your_chat_id'

import requests

def send_tg_message(text='Cell execution completed.'):  
    """
    h/t Ivan Dembicki Jr. for the base version 
    (https://medium.com/@ivan.dembicki.jr/notifications-in-jupyter-notebook-with-telegram-f2e892c55173)
    """
    requests.post('https://api.telegram.org/' +  'bot{}/sendMessage'.format(tg_api_token),
                  params=dict(chat_id=tg_chat_id, text=text))

In [11]:
def SMAPE(y_true, y_pred):
    '''
    h/t Jean-François Puget (@CPMP) -- see https://www.kaggle.com/c/web-traffic-time-series-forecasting/discussion/36414
    '''
    denominator = (y_true + np.abs(y_pred)) / 200.0
    diff = np.abs(y_true - y_pred) / denominator
    diff[denominator == 0] = 0.0
    return np.mean(diff)

In [12]:
# https://www.kaggle.com/c/ventilator-pressure-prediction/discussion/282735
def better_than_median(inputs, axis):
    """Compute the mean of the predictions if there are no outliers,
    or the median if there are outliers.

    Parameter: inputs = ndarray of shape (n_samples, n_folds)"""
    spread = inputs.max(axis=axis) - inputs.min(axis=axis) 
    spread_lim = 0.45
    print(f"Inliers:  {(spread < spread_lim).sum():7} -> compute mean")
    print(f"Outliers: {(spread >= spread_lim).sum():7} -> compute median")
    print(f"Total:    {len(inputs):7}")
    return np.where(spread < spread_lim,
                    np.mean(inputs, axis=axis),
                    np.median(inputs, axis=axis))

In [13]:
# https://www.kaggle.com/teckmengwong/tps2201-hybrid-time-series
def plot_periodogram(ts, detrend='linear', ax=None):
    from scipy.signal import periodogram
    fs = pd.Timedelta("1Y") / pd.Timedelta("1D")
    freqencies, spectrum = periodogram(
        ts,
        fs=fs,
        detrend=detrend,
        window="boxcar",
        scaling='spectrum',
    )
    if ax is None:
        _, ax = plt.subplots()
    ax.step(freqencies, spectrum, color="purple")
    ax.set_xscale("log")
    ax.set_xticks([1, 2, 4, 6, 12, 26, 52, 104])
    ax.set_xticklabels(
        [
            "Annual (1)",
            "Semiannual (2)",
            "Quarterly (4)",
            "Bimonthly (6)",
            "Monthly (12)",
            "Biweekly (26)",
            "Weekly (52)",
            "Semiweekly (104)",
        ],
        rotation=30,
    )
    ax.ticklabel_format(axis="y", style="sci", scilimits=(0, 0))
    ax.set_ylabel("Variance")
    ax.set_title("Periodogram")
    return ax

In [14]:
# https://www.kaggle.com/teckmengwong/tps2201-hybrid-time-series
def fourier_features(index, freq, order):
    time = np.arange(len(index), dtype=np.float32)
    k = 2 * np.pi * (1 / freq) * time
    features = {}
    for i in range(1, order + 1):
        features.update({
            f"sin_{freq}_{i}": np.sin(i * k),
            f"cos_{freq}_{i}": np.cos(i * k),
        })
    return pd.DataFrame(features, index=index)

## Dataset Setup

### Original Data Loading

In [15]:
# dataset_params will initially include either trivial class instances or loaded, precomputed artifacts
dataset_params = {
    'train_source': str(datapath/'train.csv'),
    'target_source': str(datapath/'train.csv'),
    'test_source': str(datapath/'test.csv'),
    # 'scaler': str(RobustScaler()),
    # 'pca': str(load(datapath/'pca_mle-RobustScaled_orig_trainset.joblib')),
    # 'umap': str(load(datapath/'umap_reducer-20211107-n_comp10-n_neighbors15-rs42-pca_mle-RobustScaled_orig_trainset.joblib')),
}   

# referring back to the already-entered attributes, specify how the pipeline was sequenced
# dataset_params['preprocessing_pipeline'] = str([dataset_params['scaler'], dataset_params['pca'], dataset_params['umap']]) # ACTUALLY this is unwieldy
# dataset_params['preprocessing_pipeline'] = '[scaler, pca, umap]' # more fragile, but also more readable

# now, load the datasets and generate more metadata from them
train_df = pd.read_csv(datapath/'train.csv')
test_df = pd.read_csv(datapath/'test.csv')
orig_train_df = train_df.copy()
orig_test_df = test_df.copy()

Since the dates are natively `Object` dtype (i.e. strings), we have to convert them:

In [16]:
# https://www.kaggle.com/ambrosm/tpsjan22-03-linear-model
for df in [train_df, test_df]:
    df['date'] = pd.to_datetime(df.date)

Provisionally, I'm going to concatenate together the `train_df` and `test_df` for preprocessing, to avoid having to constantly apply transforms twice (since I don't anticipate doing any transforms that might allow data leakage to occur).

In [17]:
all_df = pd.concat([train_df, test_df], axis=0)
# all_df.columns
len(all_df) == len(train_df) + len(test_df)

True

In [18]:
all_df.tail()

Unnamed: 0,row_id,date,country,store,product,num_sold
6565,32863,2019-12-31,Sweden,KaggleMart,Kaggle Hat,
6566,32864,2019-12-31,Sweden,KaggleMart,Kaggle Sticker,
6567,32865,2019-12-31,Sweden,KaggleRama,Kaggle Mug,
6568,32866,2019-12-31,Sweden,KaggleRama,Kaggle Hat,
6569,32867,2019-12-31,Sweden,KaggleRama,Kaggle Sticker,


### GDP Data
Here's data from Carl McBride Ellis ([notebook](https://www.kaggle.com/carlmcbrideellis/gdp-of-finland-norway-and-sweden-2015-2019) and [dataset](https://www.kaggle.com/carlmcbrideellis/gdp-20152019-finland-norway-and-sweden) for doing GDP comparisons. They're frequently used in other entries. I've created a function to add them on.

In [19]:
def add_gdp_data(df):
    gdp_df = pd.read_csv(datapath/'GDP_data_2015_to_2019_Finland_Norway_Sweden.csv')
    gdp_df.set_index('year', inplace=True)
    def get_gdp(row):
        country = 'GDP_' + row.country
        return gdp_df.loc[row.date.year, country]

    df['gdp'] = np.log1p(df.apply(get_gdp, axis=1))
    return df

I'll also define here (but perhaps move later) the GDP exponent, which will be used to transform the targets before inference (dividing num_sold by the $GDP^{1.212}$ and then taking the logarithm (after @ambrosm)

In [20]:
gdp_exponent = 1.2121103201489674 # see https://www.kaggle.com/ambrosm/tpsjan22-03-linear-model for an explanation

In [21]:
all_df = add_gdp_data(all_df)

In [22]:
all_df

Unnamed: 0,row_id,date,country,store,product,num_sold,gdp
0,0,2015-01-01,Finland,KaggleMart,Kaggle Mug,329.0,5.461456
1,1,2015-01-01,Finland,KaggleMart,Kaggle Hat,520.0,5.461456
2,2,2015-01-01,Finland,KaggleMart,Kaggle Sticker,146.0,5.461456
3,3,2015-01-01,Finland,KaggleRama,Kaggle Mug,572.0,5.461456
4,4,2015-01-01,Finland,KaggleRama,Kaggle Hat,911.0,5.461456
...,...,...,...,...,...,...,...
6565,32863,2019-12-31,Sweden,KaggleMart,Kaggle Hat,,6.282042
6566,32864,2019-12-31,Sweden,KaggleMart,Kaggle Sticker,,6.282042
6567,32865,2019-12-31,Sweden,KaggleRama,Kaggle Mug,,6.282042
6568,32866,2019-12-31,Sweden,KaggleRama,Kaggle Hat,,6.282042


### Label Encoding Setup

## Feature Engineering

### Time Features

The goal of this function is to create features that will capture seasonalities -- but **not** trends. The trends will (hopefully) be captured by the deployment of linear forecasting algorithms on raw time series data (consisting exclusively of dates and targets); we want to have seasonalities that the residual models can learn, however -- holidays, weekly patterns, climactic season patterns, etc.

The cell below will generate the `holidays` library's entries for the three countries. I may want to follow the template of @teckmengwong's code below, and add more holidays -- then, do some feature importance checking, and perhaps whittle down the features accordingly.

In [23]:
for c in [holidays.Finland, holidays.Sweden, holidays.Norway]:
#     print(c)
    for h in c(years = [2019], observed=True).items():
#         print(h)
        pass

In [24]:
def temporal_engineering(df):
    '''
    Function inspired by / borrowing from @teckmengwong and @ambrosm to create time features that will
    capture seasonality.
    '''
    
#     df[YEAR] = df[DATE].dt.year
    df['month'] = df['date'].dt.month
#     df['week'] = df['date'].dt.week # not used by Teck Meng Wong
#     df['day'] = df['date'].dt.day # not used by Teck Meng Wong
#     df['day_of_year'] = df['date'].dt.dayofyear # not used by Teck Meng Wong
#     df['day_of_month'] = df['date'].dt.days_in_month # not used by Teck Meng Wong
#     df['day_of_week'] = df['date'].dt.dayofweek # not used by Teck Meng Wong
#    df['weekday'] = df['date'].dt.weekday # not used by Teck Meng Wong
    # Teck Meng Wong mapped the integers to first-letters in triplets
    # I'm leaving it as integers, where winter=1, spring=2, summer=3, fall=4
    df['season'] = ((df['date'].dt.month % 12 + 3) // 3) #.map({1:'DJF', 2: 'MAM', 3:'JJA', 4:'SON'})
#     df['month'] = df['month'].apply(lambda x: calendar.month_abbr[x])

    df['wd4'] = df['date'].dt.weekday == 4
    df['wd56'] = df['date'].dt.weekday >= 5
#     df['wd6'] = df['date'].dt.weekday >= 6
#     df.loc[(df.date.dt.year != 2016) & (df.date.dt.month >=3), 'day_of_year'] += 1 # fix for leap years
    
    # 21 days cyclic for lunar
    dayofyear = df.date.dt.dayofyear # for convenience
    
    # here he's creating Fourier features
    for k in range(1, 32, 4):
        df[f'sin{k}'] = np.sin(dayofyear / 365 * 2 * math.pi * k)
        df[f'cos{k}'] = np.cos(dayofyear / 365 * 2 * math.pi * k)
        df[f'finland_sin{k}'] = np.where(df['country'] == 'Finland', df[f'sin{k}'], 0)
        df[f'finland_cos{k}'] = np.where(df['country'] == 'Finland', df[f'cos{k}'], 0)
        df[f'norway_sin{k}'] = np.where(df['country'] == 'Norway', df[f'sin{k}'], 0)
        df[f'norway_cos{k}'] = np.where(df['country'] == 'Norway', df[f'cos{k}'], 0)
        df[f'store_sin{k}'] = np.where(df['store'] == 'KaggleMart', df[f'sin{k}'], 0)
        df[f'store_cos{k}'] = np.where(df['store'] == 'KaggleMart', df[f'cos{k}'], 0)
        df[f'mug_sin{k}'] = np.where(df['product'] == 'Kaggle Mug', df[f'sin{k}'], 0)
        df[f'mug_cos{k}'] = np.where(df['product'] == 'Kaggle Mug', df[f'cos{k}'], 0)
        df[f'sticker_sin{k}'] = np.where(df['product'] == 'Kaggle Sticker', df[f'sin{k}'], 0)
        df[f'sticker_cos{k}'] = np.where(df['product'] == 'Kaggle Sticker', df[f'cos{k}'], 0)
    
#     df[f'semiweekly_sin'] = np.sin(dayofyear / 365 * 2 * math.pi * 14)
#     df[f'semiweekly_cos'] = np.cos(dayofyear / 365 * 2 * math.pi * 14)
#     df[f'lunar_sin'] = np.sin(dayofyear / 365 * 2 * math.pi * 21)
#     df[f'lunar_cos'] = np.cos(dayofyear / 365 * 2 * math.pi * 21)
    df[f'season_sin'] = np.sin(dayofyear / 365 * 2 * math.pi * 91.5)
    df[f'season_cos'] = np.cos(dayofyear / 365 * 2 * math.pi * 91.5)
#     df = pd.concat([df, pd.DataFrame({f'fin{ptr[1]}':
#                                       (df.date == pd.Timestamp(ptr[0])) & (df.country == 'Finland')
#                                       for ptr in holidays.Finland(years = [2015,2016,2017,2018,2019]).items()})], axis=1)
#     df = pd.concat([df, pd.DataFrame({f'nor{ptr[1]}':
#                                       (df.date == pd.Timestamp(ptr[0])) & (df.country == 'Norway')
#                                       for ptr in holidays.Norway(years = [2015,2016,2017,2018,2019]).items()})], axis=1)
#     df = pd.concat([df, pd.DataFrame({f'swe{ptr[1]}':
#                                       (df.date == pd.Timestamp(ptr[0])) & (df.country == 'Sweden')
#                                       for ptr in holidays.Sweden(years = [2015,2016,2017,2018,2019]).items()})], axis=1)

    # End of year
    # Dec - teckmengwong
    for d in range(24, 32):
        df[f"dec{d}"] = (df.date.dt.month == 12) & (df.date.dt.day == d)
    # I'm unsure of the logic of only doing this for Norway
    for d in range(24, 32):
        df[f"n-dec{d}"] = (df.date.dt.month == 12) & (df.date.dt.day == d) & (df.country == 'Norway')
    
    # not sure why he's using different date ranges for each country here
    # Jan - teckmengwong
    for d in range(1, 14):
        df[f"f-jan{d}"] = (df.date.dt.month == 1) & (df.date.dt.day == d) & (df.country == 'Finland')
    for d in range(1, 10):
        df[f"n-jan{d}"] = (df.date.dt.month == 1) & (df.date.dt.day == d) & (df.country == 'Norway')
    for d in range(1, 15):
        df[f"s-jan{d}"] = (df.date.dt.month == 1) & (df.date.dt.day == d) & (df.country == 'Sweden')
    
    
    # May - tekcmengwong
    for d in list(range(1, 10)): # May Day and after, I guess
        df[f"may{d}"] = (df.date.dt.month == 5) & (df.date.dt.day == d)
    for d in list(range(19, 26)):
        df[f"may{d}"] = (df.date.dt.month == 5) & (df.date.dt.day == d) & (df.country == 'Norway')
    # June 
    for d in list(range(8, 14)):
        df[f"june{d}"] = (df.date.dt.month == 6) & (df.date.dt.day == d) & (df.country == 'Sweden')
    
    #Swedish Rock Concert - teckmengwong
    #Jun 3, 2015 – Jun 6, 2015
    #Jun 8, 2016 – Jun 11, 2016
    #Jun 7, 2017 – Jun 10, 2017
    #Jun 6, 2018 – Jun 10, 2018
    #Jun 5, 2019 – Jun 8, 2019
    swed_rock_fest  = df.date.dt.year.map({2015: pd.Timestamp(('2015-06-6')),
                                         2016: pd.Timestamp(('2016-06-11')),
                                         2017: pd.Timestamp(('2017-06-10')),
                                         2018: pd.Timestamp(('2018-06-10')),
                                         2019: pd.Timestamp(('2019-06-8'))})

    df = pd.concat([df, pd.DataFrame({f"swed_rock_fest{d}":
                                      (df.date - swed_rock_fest == np.timedelta64(d, "D")) & (df.country == 'Sweden')
                                      for d in list(range(-3, 3))})], axis=1)

    
    # Last Wednesday of June - teckmengwong
    wed_june_date = df.date.dt.year.map({2015: pd.Timestamp(('2015-06-24')),
                                         2016: pd.Timestamp(('2016-06-29')),
                                         2017: pd.Timestamp(('2017-06-28')),
                                         2018: pd.Timestamp(('2018-06-27')),
                                         2019: pd.Timestamp(('2019-06-26'))})
    for d in list(range(-4, 6)):
        df[f"wed_june{d}"] = (df.date - wed_june_date == np.timedelta64(d, "D")) & (df.country != 'Norway')
        
    # First Sunday of November - teckmengwong
    sun_nov_date = df.date.dt.year.map({2015: pd.Timestamp(('2015-11-1')),
                                         2016: pd.Timestamp(('2016-11-6')),
                                         2017: pd.Timestamp(('2017-11-5')),
                                         2018: pd.Timestamp(('2018-11-4')),
                                         2019: pd.Timestamp(('2019-11-3'))})
    df = pd.concat([df, pd.DataFrame({f"sun_nov{d}":
                                      (df.date - sun_nov_date == np.timedelta64(d, "D")) & (df.country == 'Norway')
                                      for d in list(range(0, 9))})], axis=1)
    
    # First half of December (Independence Day of Finland, 6th of December) -teckmengwong
    df = pd.concat([df, pd.DataFrame({f"dec{d}":
                                      (df.date.dt.month == 12) & (df.date.dt.day == d) & (df.country == 'Finland')
                                      for d in list(range(6, 14))})], axis=1)
    
    # Easter -teckmengwong
    easter_date = df.date.apply(lambda date: pd.Timestamp(easter.easter(date.year)))
    df = pd.concat([df, pd.DataFrame({f"easter{d}":
                                      (df.date - easter_date == np.timedelta64(d, "D"))
                                      for d in list(range(-2, 11)) + list(range(40, 48)) + list(range(50, 59))})], axis=1)
    
    return df

In [25]:
temporal_all_df = temporal_engineering(all_df)

In [26]:
temporal_all_df

Unnamed: 0,row_id,date,country,store,product,num_sold,gdp,month,season,wd4,...,easter47,easter50,easter51,easter52,easter53,easter54,easter55,easter56,easter57,easter58
0,0,2015-01-01,Finland,KaggleMart,Kaggle Mug,329.0,5.461456,1,1,False,...,False,False,False,False,False,False,False,False,False,False
1,1,2015-01-01,Finland,KaggleMart,Kaggle Hat,520.0,5.461456,1,1,False,...,False,False,False,False,False,False,False,False,False,False
2,2,2015-01-01,Finland,KaggleMart,Kaggle Sticker,146.0,5.461456,1,1,False,...,False,False,False,False,False,False,False,False,False,False
3,3,2015-01-01,Finland,KaggleRama,Kaggle Mug,572.0,5.461456,1,1,False,...,False,False,False,False,False,False,False,False,False,False
4,4,2015-01-01,Finland,KaggleRama,Kaggle Hat,911.0,5.461456,1,1,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6565,32863,2019-12-31,Sweden,KaggleMart,Kaggle Hat,,6.282042,12,1,False,...,False,False,False,False,False,False,False,False,False,False
6566,32864,2019-12-31,Sweden,KaggleMart,Kaggle Sticker,,6.282042,12,1,False,...,False,False,False,False,False,False,False,False,False,False
6567,32865,2019-12-31,Sweden,KaggleRama,Kaggle Mug,,6.282042,12,1,False,...,False,False,False,False,False,False,False,False,False,False
6568,32866,2019-12-31,Sweden,KaggleRama,Kaggle Hat,,6.282042,12,1,False,...,False,False,False,False,False,False,False,False,False,False


At this point, the `temporal_all_df` DataFrame contains all the time features for both the training and testing sets.
* **Todo**: consider not only adding in holidays from `holidays`, but also borrowing ideas from the AmbrosM Linear notebook too (which creates fewer features, populating them instead with temporal distances from the selected holidays).

### Target Transformation
Now, I'll do the target transformation proposed by @AmbrosM. (I'll do it to the non-encoded DataFrame too, for testing with Prophet and NeuralProphet later.)

In [27]:
for df in [temporal_all_df]:
    df['target'] = np.log(df['num_sold'] / df['gdp']**gdp_exponent)

In [28]:
# encoded_all_df['target'] = np.log(encoded_all_df['num_sold'] / (encoded_all_df['gdp']**gdp_exponent))

In [29]:
temporal_all_df

Unnamed: 0,row_id,date,country,store,product,num_sold,gdp,month,season,wd4,...,easter50,easter51,easter52,easter53,easter54,easter55,easter56,easter57,easter58,target
0,0,2015-01-01,Finland,KaggleMart,Kaggle Mug,329.0,5.461456,1,1,False,...,False,False,False,False,False,False,False,False,False,3.738239
1,1,2015-01-01,Finland,KaggleMart,Kaggle Hat,520.0,5.461456,1,1,False,...,False,False,False,False,False,False,False,False,False,4.196010
2,2,2015-01-01,Finland,KaggleMart,Kaggle Sticker,146.0,5.461456,1,1,False,...,False,False,False,False,False,False,False,False,False,2.925788
3,3,2015-01-01,Finland,KaggleRama,Kaggle Mug,572.0,5.461456,1,1,False,...,False,False,False,False,False,False,False,False,False,4.291321
4,4,2015-01-01,Finland,KaggleRama,Kaggle Hat,911.0,5.461456,1,1,False,...,False,False,False,False,False,False,False,False,False,4.756724
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6565,32863,2019-12-31,Sweden,KaggleMart,Kaggle Hat,,6.282042,12,1,False,...,False,False,False,False,False,False,False,False,False,
6566,32864,2019-12-31,Sweden,KaggleMart,Kaggle Sticker,,6.282042,12,1,False,...,False,False,False,False,False,False,False,False,False,
6567,32865,2019-12-31,Sweden,KaggleRama,Kaggle Mug,,6.282042,12,1,False,...,False,False,False,False,False,False,False,False,False,
6568,32866,2019-12-31,Sweden,KaggleRama,Kaggle Hat,,6.282042,12,1,False,...,False,False,False,False,False,False,False,False,False,


### Label Encoding

In [30]:
from sklearn.preprocessing import LabelEncoder
le_dict = {feature: LabelEncoder().fit(orig_train_df[feature]) for feature in ['country', 'product', 'store']}

Now, we'll do the encoding.

In [31]:
encoded_all_df = temporal_all_df.copy()

In [32]:
for feature in ['country', 'product', 'store']:
    encoded_all_df[feature] = le_dict[feature].transform(temporal_all_df[feature])

In [33]:
encoded_all_df

Unnamed: 0,row_id,date,country,store,product,num_sold,gdp,month,season,wd4,...,easter50,easter51,easter52,easter53,easter54,easter55,easter56,easter57,easter58,target
0,0,2015-01-01,0,0,1,329.0,5.461456,1,1,False,...,False,False,False,False,False,False,False,False,False,3.738239
1,1,2015-01-01,0,0,0,520.0,5.461456,1,1,False,...,False,False,False,False,False,False,False,False,False,4.196010
2,2,2015-01-01,0,0,2,146.0,5.461456,1,1,False,...,False,False,False,False,False,False,False,False,False,2.925788
3,3,2015-01-01,0,1,1,572.0,5.461456,1,1,False,...,False,False,False,False,False,False,False,False,False,4.291321
4,4,2015-01-01,0,1,0,911.0,5.461456,1,1,False,...,False,False,False,False,False,False,False,False,False,4.756724
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6565,32863,2019-12-31,2,0,0,,6.282042,12,1,False,...,False,False,False,False,False,False,False,False,False,
6566,32864,2019-12-31,2,0,2,,6.282042,12,1,False,...,False,False,False,False,False,False,False,False,False,
6567,32865,2019-12-31,2,1,1,,6.282042,12,1,False,...,False,False,False,False,False,False,False,False,False,
6568,32866,2019-12-31,2,1,0,,6.282042,12,1,False,...,False,False,False,False,False,False,False,False,False,


### Pseudolabeling

I'm not going to try this right now, but I may return to it later -- I note that Teck Meng Wong had some good results with it.

### Data Splitting, Modification

Now that the preprocessing is done, I'm going to split the data back into the train and test sets; then, I'll create a view on the dataframes that omits the year. The year-less dataframes will be suitable for residual learning.

In [34]:
all_df = encoded_all_df.drop(columns=['num_sold', 'row_id'])

In [35]:
tv_df = all_df[:len(orig_train_df)] # training and validation sets
test_df = all_df[len(orig_train_df):]
# train_df = encoded_all_df.iloc[np.where(encoded_all_df['date'] < '2019-01-01'), :]
# test_df = encoded_all_df[[np.where(encoded_all_df['date'] > '2018-12-31')]]

In [36]:
valid_df = tv_df[tv_df['date'] > '2017-12-31']

In [37]:
train_df = tv_df[tv_df['date'] <= '2017-12-31']

In [38]:
# train_and_valid_residual_df = train_and_valid_df.drop(columns=['date'])
# test_residual_df = test_df.drop(columns=['date'])

In [39]:
len(valid_df) + len(train_df) == len(tv_df)

True

# Training

### Linear Models

To start, I'll just try a few models that Teck Meng Wong has proposed.

In [40]:
from sklearn.linear_model import Ridge, HuberRegressor
from sklearn.neural_network import MLPRegressor

Linear models from Scikit-Learn seemingly require that datetime data be converted to numerics.

In [41]:
import datetime as dt

In [42]:
train_linear_df = train_df.copy()
valid_linear_df = valid_df.copy()
test_linear_df = test_df.copy()

train_linear_df['date'] = train_df['date'].map(dt.datetime.toordinal)
valid_linear_df['date'] = valid_df['date'].map(dt.datetime.toordinal)
test_linear_df['date'] = test_df['date'].map(dt.datetime.toordinal)

#### Ridge

In [43]:
ridge = Ridge()

In [44]:
X = train_linear_df.drop(columns=['target'])
y = train_linear_df['target']

In [45]:
X

Unnamed: 0,date,country,store,product,gdp,month,season,wd4,wd56,sin1,...,easter47,easter50,easter51,easter52,easter53,easter54,easter55,easter56,easter57,easter58
0,735599,0,0,1,5.461456,1,1,False,False,1.721336e-02,...,False,False,False,False,False,False,False,False,False,False
1,735599,0,0,0,5.461456,1,1,False,False,1.721336e-02,...,False,False,False,False,False,False,False,False,False,False
2,735599,0,0,2,5.461456,1,1,False,False,1.721336e-02,...,False,False,False,False,False,False,False,False,False,False
3,735599,0,1,1,5.461456,1,1,False,False,1.721336e-02,...,False,False,False,False,False,False,False,False,False,False
4,735599,0,1,0,5.461456,1,1,False,False,1.721336e-02,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19723,736694,2,0,0,6.295301,12,1,False,True,-2.449294e-16,...,False,False,False,False,False,False,False,False,False,False
19724,736694,2,0,2,6.295301,12,1,False,True,-2.449294e-16,...,False,False,False,False,False,False,False,False,False,False
19725,736694,2,1,1,6.295301,12,1,False,True,-2.449294e-16,...,False,False,False,False,False,False,False,False,False,False
19726,736694,2,1,0,6.295301,12,1,False,True,-2.449294e-16,...,False,False,False,False,False,False,False,False,False,False


In [46]:
%time
ridge.fit(X, y)

CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 5.48 µs


Ridge()

In [47]:
X_valid = valid_linear_df.drop(columns=['target'])
y_valid = valid_linear_df['target']

In [48]:
ridge_valid_preds = ridge.predict(X_valid)

In [49]:
SMAPE(y_pred=ridge_valid_preds, y_true=y_valid)

4.297207111341678

That's not bad.

#### Lasso

In [50]:
from sklearn.linear_model import Lasso

In [52]:
lasso_model = Lasso()

In [53]:
lasso_model.fit(X,y)

Lasso()

In [55]:
lasso_preds = lasso_model.predict(X_valid)

In [57]:
SMAPE(y_pred=lasso_preds, y_true=y_valid)

14.719972647663342

Not so good there.

#### Huber

In [58]:
huber_model = HuberRegressor()

In [59]:
huber_model.fit(X,y)

HuberRegressor()

In [60]:
huber_valid_preds = huber_model.predict(X_valid)

In [61]:
SMAPE(y_pred=huber_valid_preds, y_true=y_valid)

14.808248729247591

That's not so great.

#### MLPRegressor

In [63]:
mlp_model = MLPRegressor(hidden_layer_sizes=(200,100), learning_rate_init=0.01, early_stopping=True, max_iter=300, random_state=42, learning_rate='adaptive')

In [64]:
mlp_model.fit(X,y)

MLPRegressor(early_stopping=True, hidden_layer_sizes=(200, 100),
             learning_rate='adaptive', learning_rate_init=0.01, max_iter=300,
             random_state=42)

In [65]:
mlp_preds = mlp_model.predict(X_valid)

In [66]:
SMAPE(y_pred=mlp_preds, y_true=y_valid)

15.639894056665263

Not great, that. Let's try at least one more set of hyperparams.

In [67]:
mlp_model = MLPRegressor(hidden_layer_sizes=(200,100,50), 
                         learning_rate_init=0.01, 
                         early_stopping=True, 
                         max_iter=300, 
                         random_state=42, 
                         learning_rate='adaptive')

In [68]:
mlp_model.fit(X,y)

MLPRegressor(early_stopping=True, hidden_layer_sizes=(200, 100, 50),
             learning_rate='adaptive', learning_rate_init=0.01, max_iter=300,
             random_state=42)

In [69]:
mlp_preds = mlp_model.predict(X_valid)

In [70]:
SMAPE(y_pred=mlp_preds, y_true=y_valid)

14.763122872881882

Marginally better. Let's try one more layer.

In [71]:
mlp_model = MLPRegressor(hidden_layer_sizes=(200,100,50,25), 
                         learning_rate_init=0.01, 
                         early_stopping=True, 
                         max_iter=300, 
                         random_state=42, 
                         learning_rate='adaptive')

In [72]:
mlp_model.fit(X,y)

MLPRegressor(early_stopping=True, hidden_layer_sizes=(200, 100, 50, 25),
             learning_rate='adaptive', learning_rate_init=0.01, max_iter=300,
             random_state=42)

In [73]:
mlp_preds = mlp_model.predict(X_valid)

In [74]:
SMAPE(y_pred=mlp_preds, y_true=y_valid)

14.862419246640776

A bit worse there.

#### Linear Regression

In [75]:
from sklearn.linear_model import LinearRegression

In [76]:
linreg_model = LinearRegression(fit_intercept=False)

In [77]:
linreg_model.fit(X,y)

LinearRegression(fit_intercept=False)

In [78]:
linreg_preds = linreg_model.predict(X_valid)

In [79]:
SMAPE(y_pred=linreg_preds, y_true=y_valid)

6.752939183524428

That's better

### Pure Time Series (Prophet, NeuralProphet)

Here, it's necessary to break things up by country, store, and product. So we'll hark to @gunesevitan's notebook. 

In [72]:
countries = ['Finland', 'Norway', 'Sweden']
# countries_encoded = [0,1,2]
stores = ['KaggleMart', 'KaggleRama']
# stores_encoded = [0,1]
products = ['Kaggle Hat', 'Kaggle Mug', 'Kaggle Sticker']
# products_encoded = [0,1,2]

In [73]:
# for country in countries_encoded:
#     for store in stores_encoded:
#         for product in products_encoded:
#             train_idx = train_df['country'] 

NameError: name 'countries_encoded' is not defined

In [None]:
train_ts = train_df[['date','target']]

In [None]:
train_ts

In [None]:
valid_ts = valid_df[['date', 'target']]

If I wanted to throw in a Prophet and/or Neural Prophet run here, I could just rename these `ds` and `y` respectively, and proceed.

In [None]:
from prophet import Prophet

In [None]:
prophet_model = Prophet()

In [None]:
train_ts = train_ts.rename(columns={'date': 'ds',
                        'target': 'y'}
               )

In [None]:
train_ts

In [None]:
prophet_model.fit(df=train_ts)

In [None]:
valid_ts = valid_ts.rename(columns={'date': 'ds',
                        'target': 'y'}
               )

In [None]:
train_preds = prophet_model.predict(train_ts[['ds']])['yhat']
valid_preds = prophet_model.predict(valid_ts[['ds']])['yhat']

In [None]:
SMAPE(train_ts['y'].values, train_preds.values)

In [None]:
SMAPE(valid_preds.values, valid_ts['y'].values)

In [None]:
valid_preds 

In [None]:
valid_ts['y']

### GBMs

Now, I'm going to try training the GBMs, stripping out the years. I won't -- yet -- make them residual-only, however. I'll double back to the `tv_df`, pre-validation splitting.

In [74]:
tv_df_gbm = tv_df.drop(columns=['date'])

In [76]:
tv_df_gbm.columns

Index(['country', 'store', 'product', 'gdp', 'month', 'season', 'wd4', 'wd56',
       'sin1', 'cos1',
       ...
       'easter50', 'easter51', 'easter52', 'easter53', 'easter54', 'easter55',
       'easter56', 'easter57', 'easter58', 'target'],
      dtype='object', length=244)

Going to bring over my old cross-trainer function, give it a whirl.

In [47]:
def cross_validate_model(arch:str, X=X, y=y, X_test=X_test, model_params:dict={}, training_params=training_params, dataset_params=dataset_params,
                         folds=list(range(folds)), exmodel_config=exmodel_config, wandb_config=wandb_config,  telegram=True, random_state=42, 
                         wandb_tracked=True, encode_cats=False):
    """
    Function to handle model training process in the context of cross-validation -- via hold-out or via k-fold.
    If exmodel_config['cross_val_strategy'] == None, then any kfolds= input is ignored; otherwise, the number specified is used.
    
    :param kfolds: int specifying number of k-folds to use in cross-validation
    :param exmodel_config: dict containing general config including for cross-validation -- `kfold=1` implies hold-out
    """
    # if exmodel_config['kfolds'] == 1: # holdout case
    #     print("Proceeding with holdout")
    #     X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, 
    #                                                           random_state=SEED)                 
    # else: # k-fold cross validation case
    #     # prepare for k-fold cross-validation; random-state here is notebook-wide, not per-model
    #     # shuffle on the initial sets, but not subsequently -- performing the same operation twice means a very different dataset
    #     if shuffle_kfolds:
    #         kfold = exmodel_config['cross_val_strategy'](n_splits=exmodel_config['kfolds'], shuffle=True, random_state=SEED)
    #     else:
    #         kfold = exmodel_config['cross_val_strategy'](n_splits=exmodel_config['kfolds'], shuffle=False)
    
    kfold = training_params['cross_val_strategy']
    zz
    if wandb_tracked:
        exmodel_config['arch'] = arch
        exmodel_config[f'{arch}_params'] = str(model_params)
        wandb.init(
            project="202112_Kaggle_tabular_playground",
            save_code=True,
            tags=wandb_config['tags'],
            name=wandb_config['name'],
            notes=wandb_config['notes'],
            config=exmodel_config
    )   
    
    # initialize lists for out-of-fold preds and ground truth
    oof_preds, oof_y = [], []
    
    # initialize a numpy.ndarray containing the fold-model's preds for test set
    
    test_preds = np.zeros((X_test.shape[0]))
    # test_probs = np.zeros((X_test.shape[0]))
    # preprocessing
    # if using a GBM, simply use the RobustScaler
        # scaler = RobustScaler()
        # X = scaler.fit_transform(X)
        # X_test = scaler.transform(X_test)
    
    for fold, (train_ids, valid_ids) in enumerate(kfold.split(X,y)):
        if fold not in folds: # skip folds that are already trained, i.e. that haven't been specified
            continue
        else:
            print(f"FOLD {fold}")
            print("---------------------------------------------------")
            y_train, y_valid = y[train_ids], y[valid_ids] # y will be an np.ndarray already; handling will be same regardless of model
            if isinstance(X, np.ndarray):
                X_train, X_valid = X[train_ids], X[valid_ids]
            else:
                X_train, X_valid = X.iloc[train_ids,:], X.iloc[valid_ids,:] # bc need pandas.DataFrames for ce
                
                # scaling
                # category_encoding
                # if encode_cats:
                #     encoder = ce.WOEEncoder(cols=categoricals)
                #     encoder.fit(X_train,y_train)
                #     X_train = encoder.transform(X_train)
                #     X_valid = encoder.transform(X_valid)
                # # exmodel_config['feature_count'] = len(X.columns)
                #     wandb.log({
                #         'feature_count': X_train.shape[1],
                #         'instance_count': X_train.shape[0],
                #         'encoder': str(encoder)
                #     })
        
        # define models
        if arch == 'xgboost':
            model = XGBClassifier(
                booster='gbtree',
                tree_method='gpu_hist',
                random_state=random_state,
                n_jobs=-1, 
                verbosity=1, 
                objective='binary:logistic',
                **model_params)
            if wandb_tracked:
                model.fit(X_train, y_train, callbacks=[wandb.xgboost.wandb_callback()])
            else:
                model.fit(X_train, y_train)
            
            y_valid_preds = model.predict(X_valid)
            # y_valid_probs = model.predict_proba(X_valid)
            
            # add the fold-model's OOF preds and ground truths to the out-of-loop lists
            oof_preds.extend(y_valid_preds)
            # oof_probs.extend(y_valid_probs)
            oof_y.extend(y_valid)
            
            # add the fold's predictions to the model's test-set predictions (will divide later)
            test_preds += model.predict(X_test)
            # test_probs += model.predict_proba(X_test)


        elif arch == 'lightgbm':
            # try:
            model = LGBMClassifier(
                objective='binary',
                random_state=random_state,
#                     device_type='cpu',
#                     n_jobs=-1,
#                 eval_metric='auc',
                device_type='gpu',
                max_bin=63, # 15 might be even better for GPU perf, but depends on dataset -- see https://lightgbm.readthedocs.io/en/latest/GPU-Performance.html
                gpu_use_dp=False, # forces use of single precision rather than double for better perf, esp on consumer Nvidia chips
                **model_params)

            if wandb_tracked:
                model.fit(X_train, y_train, callbacks=[wandb.lightgbm.wandb_callback()],)
            else:
                model.fit(X_train, y_train)
#             except LightGBMError:
#                 model = LGBMClassifier(
#                     objective='binary',
#                     random_state=random_state,
#                     device_type='cpu',
#                     n_jobs=-1,
#     #                 eval_metric='auc',
#     #                 device_type='gpu',
#     #                 max_bin=63, # 15 might be even better for GPU perf, but depends on dataset -- see https://lightgbm.readthedocs.io/en/latest/GPU-Performance.html
#     #                 gpu_use_dp=False, # forces use of single precision rather than double for better perf, esp on consumer Nvidia chips
#                     **params)
                
#                 if wandb_tracked:
#                     model.fit(X_train, y_train, callbacks=[wandb.lightgbm.wandb_callback()],)
#                 else:
#                     model.fit(X_train, y_train)
            y_valid_preds = model.predict(X_valid)
            # y_valid_probs = model.predict_proba(X_valid)
            
            # add the fold-model's OOF preds and ground truths to the out-of-loop lists
            oof_preds.extend(y_valid_preds)
            # oof_probs.extend(y_valid_probs)
            oof_y.extend(y_valid)
            
            # add the fold's predictions to the model's test-set predictions (will divide later)
            test_preds += model.predict(X_test)
            # test_probs += model.predict_proba(X_test)[:,1]

            
        elif arch == 'catboost':
            model = CatBoostClassifier(
                task_type='GPU',
                silent=True,
                random_state=random_state,
                **model_params) 
        
            model.fit(X_train, y_train)
            
            y_valid_preds = model.predict(X_valid)
            # y_valid_probs = model.predict_proba(X_valid)[:,1] # this would only take one of 7 cols
            
            # add the fold-model's OOF preds and ground truths to the out-of-loop lists
            oof_preds.extend(y_valid_preds)
            # oof_probs.extend(y_valid_probs)
            oof_y.extend(y_valid)
            
            # add the fold's predictions to the model's test-set predictions (will divide later)
            test_preds += model.predict(X_test).flatten()
            # test_probs += model.predict_proba(X_test)[:,1]
            
#         valid_loss = log_loss(y_valid, y_pred)
        # give the valid AUC score, for edification

        fold_accuracy = accuracy_score(y_true=y_valid, y_pred=y_valid_preds) # or should be preds?
        # fold_confusion = confusion_matrix(y_true=y_valid, y_pred=y_valid_preds)# , labels=list(range(7)))
        # fold_log_loss = log_loss(y_pred=y_valid_preds, y_true=y_valid,) #labels=list(range(7)))
        # fold_roc_auc = roc_auc_score(y_true=y_valid, y_score=y_valid_probs)
        # fold_f1_score = f1_score(
        # fold_fbeta_score = fbeta_score(
        
        if wandb_tracked:
            wandb.log({f'fold{fold}_accuracy': fold_accuracy,
                       # f'fold{fold}_confusion': fold_confusion,
                       # f'fold{fold}_log_loss': fold_log_loss,
                       # f'fold{fold}_roc_auc': fold_roc_auc,
                      })
        fold_human_results = f"{os.environ['WANDB_NOTEBOOK_NAME']}\nMetrics for fold {fold} are: \nAccuracy: {fold_accuracy}"
        print(fold_human_results)
        if telegram:
            send_tg_message(text=f"{arch} model's fold {fold} complete.\n"+fold_human_results)
        # dump(model, Path(runpath/f"{arch}_fold{fold}_rs{random_state}_model.joblib"))

    model_accuracy = accuracy_score(y_true=oof_y, y_pred=oof_preds) 
    # model_confusion = confusion_matrix(y_true=oof_y, y_pred=oof_preds, labels=list(range(7)))
    # model_log_loss = log_loss(y_pred=oof_preds, y_true=oof_y, labels=list(range(7)))
    # model_valid_auc = roc_auc_score(oof_y, oof_preds)
    model_human_results = f"{os.environ['WANDB_NOTEBOOK_NAME']}\nMetrics for model {arch} are: \nAccuracy: {model_accuracy}"
    print(model_human_results)
    if telegram:
        send_tg_message(text=f"{arch} model run complete.\n"+model_human_results)
    if wandb_tracked:
        wandb.log({f'model_accuracy': fold_accuracy,
                   # f'model_confusion': fold_confusion,
                   # f'model_log_loss': fold_log_loss,
                   # f'model_roc_auc': fold_roc_auc,
                   'model_params': str(model.get_params()),
                   'model_seed': random_state,
                  })
        wandb.finish()
    
    # finalize test preds
    # test_probs /= exmodel_config['kfolds']
    # test_preds /= exmodel_config['kfolds']
    
    
    # save OOF preds and test-set preds
#     if 'widedeep' in arch:
#         dump(oof_preds, Path(predpath/f"{wandb_config['name']}_{arch}_{exmodel_config['kfolds']}folds_{n_epochs}epochs-per-fold_rs{random_state}_oof_preds.joblib"))
#         dump(test_preds, Path(predpath/f"{wandb_config['name']}_{arch}_{exmodel_config['kfolds']}folds_{n_epochs}epochs-per-fold_rs{random_state}_test_preds.joblib"))
    
#     else:
#         dump(oof_preds, Path(predpath/f"{wandb_config['name']}_{arch}_{exmodel_config['kfolds']}folds_rs{random_state}_oof_preds.joblib"))
#         dump(test_preds, Path(predpath/f"{wandb_config['name']}_{arch}_{exmodel_config['kfolds']}folds_rs{random_state}_test_preds.joblib"))
    
    # if not (datapath/f"{exmodel_config['kfolds']}folds_rs{SEED}_oof_y.joblib").is_file():
    #     dump(oof_y, predpath/f"{exmodel_config['kfolds']}folds_rs{SEED}_oof_y.joblib")
    
#     if wandb_tracked:
# #         if 'widedeep' in arch:
#         wandb.log({'model_valid_auc': model_valid_auc,
# #                    'oof_preds': oof_preds,
# #                    'test_preds': test_preds,
#                    'model_params': str(model.parameters()) if 'widedeep' in arch else str(model.get_params()), 
#         #                    'model_params': str(model.get_params()),
#         })
# #         wandb.log({'model_valid_auc': model_valid_auc,
# #                    'oof_preds': oof_preds,
# #                    'test_preds': test_preds,
# # #                    'model_params': str(model.get_params()),
# #                   })
#         wandb.finish()
    return oof_preds, test_preds#, model_confusion
        

  and should_run_async(code)


# Old stuff

## Training Params

In [16]:
# training_params = {
#     'general_random_state': SEED,
# }

# folds = 5
# training_params['cross_val_strategy'] = StratifiedKFold(n_splits=folds, shuffle=True, random_state=SEED)

## Model Params

## Metadata

In [None]:
# # baseline -- alter as needed later
# exmodel_config = {
#     'general_random_state': SEED,
# #     'feature_generation': ['NaN_counts', 'SummaryStats', 'NaN_OneHots'],
#     **dataset_params,
# #     **training_params,
# #     **model_params # perhaps do later
# }

## WandB Config

In [None]:
# # wandb config:
# wandb_config = {
#     'name': f"{os.environ['WANDB_NOTEBOOK_NAME'][:-6]}_{datetime.now().strftime('%H%M%S')}", # just removes the .ipynb extension, leaving the notebook filename's stem
#     'tags': ['EDA'],
#     'notes': "EDA"
# }