# Hybrid
Going to attempt a hybrid model after the example of [this Teck Meng Wong notebook](https://www.kaggle.com/teckmengwong/tps2201-hybrid-time-series/notebook).

- 20220122: Going to try to form ensembles, with more code architecture. Forecasting models will include Prophet, NeuralProphet, Ridge, and Linear (with more to come -- e.g. perhaps transformers and other DNNs); residual models will include GBMs, perhaps some tabular DNNs too.

In [1]:
# notebook configuration
# if '/sf/' in pwd:
#     COLAB, SAGE = False, False
# elif 'google.colab' in str(get_ipython()):
#     COLAB, SAGE = True, False # do colab-specific installs later
# else:
#     COLAB, SAGE = False, True
    
CONTEXT = 'local' # or 'colab', 'sage', 'kaggle'
USE_GPU = True 
%config Completer.use_jedi = False

## Imports

In [2]:
# basic imports
from pathlib import Path
import os
import math
from datetime import datetime
import random

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import warnings
warnings.filterwarnings("ignore")

import requests # for telegram notifications
from tqdm.notebook import tqdm

from joblib import dump, load

Now, non-stdlib imports

In [3]:
# model selection
from sklearn.model_selection import train_test_split, StratifiedKFold, KFold

# metrics
from sklearn.metrics import accuracy_score, mean_absolute_error, mean_squared_error

# normalization
# from sklearn.preprocessing import RobustScaler, StandardScaler, MinMaxScaler, MaxAbsScaler, RobustScaler, QuantileTransformer
# from gauss_rank_scaler import GaussRankScaler

# feature generation
# import category_encoders as ce

# models -- will be imported JIT
# from catboost import CatBoostClassifier
# from xgboost import XGBClassifier
# from lightgbm import LGBMClassifier
# from sklearn.ensemble import StackingClassifier, RandomForestClassifier
import torch
from torch.optim import Adam, AdamW, Adagrad, SGD, RMSprop, LBFGS
from torch.optim.lr_scheduler import ReduceLROnPlateau, CosineAnnealingWarmRestarts, CyclicLR, OneCycleLR, StepLR, CosineAnnealingLR
# from pytorch_widedeep import Trainer
# from pytorch_widedeep.preprocessing import WidePreprocessor, TabPreprocessor
# from pytorch_widedeep.models import Wide, TabMlp, WideDeep, SAINT#, TabTransformer, TabNet, TabFastFormer, TabResnet
# from pytorch_widedeep.metrics import Accuracy
# from pytorch_widedeep.callbacks import EarlyStopping, LRHistory, ModelCheckpoint

# feature reduction
# from sklearn.decomposition import PCA
# from umap import UMAP

# clustering
# from sklearn.cluster import DBSCAN, KMeans
# import hdbscan

# feature selection
# from sklearn.feature_selection import SelectKBest, f_regression, mutual_info_regression
# import featuretools as ft
# from BorutaShap import BorutaShap
# from boruta import BorutaPy

# tracking 
import wandb
from wandb.xgboost import wandb_callback
from wandb.lightgbm import wandb_callback
os.environ['WANDB_NOTEBOOK_NAME'] = f"nb_{datetime.now().strftime('%Y%m%d')}.ipynb"

In [4]:
# # time series
# import tsfresh

# import darts
# from darts import TimeSeries
# from darts.models import ExponentialSmoothing, AutoARIMA, ARIMA, Prophet, RandomForest, RegressionEnsembleModel, RegressionModel, TFTModel, TCNModel, TransformerModel, NBEATSModel
import holidays
import dateutil.easter as easter
from prophet import Prophet
from neuralprophet import NeuralProphet

## Routing

Now, datapath setup

In [5]:
if CONTEXT == 'colab':
    # mount Google Drive
    from google.colab import drive
    drive.mount('/content/drive')
    
    # handling datapath
    # datapath = Path('/content/drive/MyDrive/kaggle/tabular_playgrounds/dec2021/')
    root = Path('') # TODO

elif CONTEXT == 'sage':
    root = Path('') # TODO
    
elif CONTEXT == 'kaggle':
    root = Path('') # TODO
    
else: # if on local machine
    root = Path('/media/sf/easystore/kaggle_data/tabular_playgrounds/jan2022/')
    datapath = root/'datasets'
    # edapath = root/'EDA'
    # modelpath = Path('/media/sf/easystore/kaggle_data/tabular_playgrounds/oct2021/models/')
    predpath = root/'preds'
    subpath = root/'submissions'
    studypath = root/'studies'
    
    for pth in [datapath, predpath, subpath, studypath]:
        pth.mkdir(exist_ok=True)

## Helpers

In [6]:
SEED = 42

# Function to seed everything but the models
def seed_everything(seed, pytorch=True, reproducible=True):
    random.seed(seed)
    np.random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    if pytorch:
        torch.manual_seed(seed) # set torch CPU seed
        if torch.cuda.is_available():
            torch.cuda.manual_seed_all(seed) # set torch GPU(s) seed(s)
        if reproducible and torch.backends.cudnn.is_available():
            torch.backends.cudnn.deterministic = True
            torch.backends.cudnn.benchmark = False

seed_everything(seed=SEED)

In [7]:
def reduce_memory_usage(df, verbose=True):
    """
    Function to reduce memory usage by downcasting datatypes in a Pandas DataFrame when possible.
    
    h/t to Bryan Arnold (https://www.kaggle.com/puremath86/label-correction-experiments-tps-nov-21)
    """
    
    numerics = ["int8", "int16", "int32", "int64", "float16", "float32", "float64"]
    start_mem = df.memory_usage().sum() / 1024 ** 2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == "int":
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if (
                    c_min > np.finfo(np.float16).min
                    and c_max < np.finfo(np.float16).max
                ):
                    df[col] = df[col].astype(np.float16)
                elif (
                    c_min > np.finfo(np.float32).min
                    and c_max < np.finfo(np.float32).max
                ):
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024 ** 2
    if verbose:
        print(
            "Mem. usage decreased to {:.2f} Mb ({:.1f}% reduction)".format(
                end_mem, 100 * (start_mem - end_mem) / start_mem
            )
        )
    return df

In [8]:
tg_api_token = 'your_api_token' # for Galileo (jupyter_watcher_bot) on Telegram
tg_chat_id = 'your_chat_id'

import requests

def send_tg_message(text='Cell execution completed.'):  
    """
    h/t Ivan Dembicki Jr. for the base version 
    (https://medium.com/@ivan.dembicki.jr/notifications-in-jupyter-notebook-with-telegram-f2e892c55173)
    """
    requests.post('https://api.telegram.org/' +  'bot{}/sendMessage'.format(tg_api_token),
                  params=dict(chat_id=tg_chat_id, text=text))

In [9]:
def SMAPE(y_true, y_pred):
    '''
    h/t Jean-François Puget (@CPMP) -- see https://www.kaggle.com/c/web-traffic-time-series-forecasting/discussion/36414
    '''
    denominator = (y_true + np.abs(y_pred)) / 200.0
    diff = np.abs(y_true - y_pred) / denominator
    diff[denominator == 0] = 0.0
    return np.mean(diff)

In [10]:
# https://www.kaggle.com/c/ventilator-pressure-prediction/discussion/282735
def better_than_median(inputs, axis):
    """Compute the mean of the predictions if there are no outliers,
    or the median if there are outliers.

    Parameter: inputs = ndarray of shape (n_samples, n_folds)"""
    spread = inputs.max(axis=axis) - inputs.min(axis=axis) 
    spread_lim = 0.45
    print(f"Inliers:  {(spread < spread_lim).sum():7} -> compute mean")
    print(f"Outliers: {(spread >= spread_lim).sum():7} -> compute median")
    print(f"Total:    {len(inputs):7}")
    return np.where(spread < spread_lim,
                    np.mean(inputs, axis=axis),
                    np.median(inputs, axis=axis))

In [11]:
# https://www.kaggle.com/teckmengwong/tps2201-hybrid-time-series
def plot_periodogram(ts, detrend='linear', ax=None):
    from scipy.signal import periodogram
    fs = pd.Timedelta("1Y") / pd.Timedelta("1D")
    freqencies, spectrum = periodogram(
        ts,
        fs=fs,
        detrend=detrend,
        window="boxcar",
        scaling='spectrum',
    )
    if ax is None:
        _, ax = plt.subplots()
    ax.step(freqencies, spectrum, color="purple")
    ax.set_xscale("log")
    ax.set_xticks([1, 2, 4, 6, 12, 26, 52, 104])
    ax.set_xticklabels(
        [
            "Annual (1)",
            "Semiannual (2)",
            "Quarterly (4)",
            "Bimonthly (6)",
            "Monthly (12)",
            "Biweekly (26)",
            "Weekly (52)",
            "Semiweekly (104)",
        ],
        rotation=30,
    )
    ax.ticklabel_format(axis="y", style="sci", scilimits=(0, 0))
    ax.set_ylabel("Variance")
    ax.set_title("Periodogram")
    return ax

In [12]:
# https://www.kaggle.com/teckmengwong/tps2201-hybrid-time-series
def fourier_features(index, freq, order):
    time = np.arange(len(index), dtype=np.float32)
    k = 2 * np.pi * (1 / freq) * time
    features = {}
    for i in range(1, order + 1):
        features.update({
            f"sin_{freq}_{i}": np.sin(i * k),
            f"cos_{freq}_{i}": np.cos(i * k),
        })
    return pd.DataFrame(features, index=index)

## Dataset Setup

### Original Data Loading

In [13]:
# dataset_params will initially include either trivial class instances or loaded, precomputed artifacts
dataset_params = {
    'train_source': str(datapath/'train.csv'),
    'target_source': str(datapath/'train.csv'),
    'test_source': str(datapath/'test.csv'),
    # 'scaler': str(RobustScaler()),
    # 'pca': str(load(datapath/'pca_mle-RobustScaled_orig_trainset.joblib')),
    # 'umap': str(load(datapath/'umap_reducer-20211107-n_comp10-n_neighbors15-rs42-pca_mle-RobustScaled_orig_trainset.joblib')),
}   

# referring back to the already-entered attributes, specify how the pipeline was sequenced
# dataset_params['preprocessing_pipeline'] = str([dataset_params['scaler'], dataset_params['pca'], dataset_params['umap']]) # ACTUALLY this is unwieldy
# dataset_params['preprocessing_pipeline'] = '[scaler, pca, umap]' # more fragile, but also more readable

# now, load the datasets and generate more metadata from them
train_df = pd.read_csv(datapath/'train.csv')
test_df = pd.read_csv(datapath/'test.csv')
orig_train_df = train_df.copy()
orig_test_df = test_df.copy()

Since the dates are natively `Object` dtype (i.e. strings), we have to convert them:

In [14]:
# https://www.kaggle.com/ambrosm/tpsjan22-03-linear-model
for df in [train_df, test_df]:
    df['date'] = pd.to_datetime(df.date)

# for convenience later
countries = ['Sweden', 'Finland', 'Norway']
stores = ['KaggleMart', 'KaggleRama']
products = ['Kaggle Mug', 'Kaggle Hat', 'Kaggle Sticker']

Provisionally, I'm going to concatenate together the `train_df` and `test_df` for preprocessing, to avoid having to constantly apply transforms twice (since I don't anticipate doing any transforms that might allow data leakage to occur).

In [15]:
all_df = pd.concat([train_df, test_df], axis=0)
# all_df.columns
len(all_df) == len(train_df) + len(test_df)

True

### GDP Data
Here's data from Carl McBride Ellis ([notebook](https://www.kaggle.com/carlmcbrideellis/gdp-of-finland-norway-and-sweden-2015-2019) and [dataset](https://www.kaggle.com/carlmcbrideellis/gdp-20152019-finland-norway-and-sweden) for doing GDP comparisons. They're frequently used in other entries. I've created a function to add them on.

In [16]:
def add_gdp_data(df):
    gdp_df = pd.read_csv(datapath/'GDP_data_2015_to_2019_Finland_Norway_Sweden.csv')
    gdp_df.set_index('year', inplace=True)
    def get_gdp(row):
        country = 'GDP_' + row.country
        return gdp_df.loc[row.date.year, country]

    df['gdp'] = np.log1p(df.apply(get_gdp, axis=1))
    return df

I'll also define here (but perhaps move later) the GDP exponent, which will be used to transform the targets before inference (dividing num_sold by the $GDP^{1.212}$ and then taking the logarithm (after @ambrosm)

In [17]:
gdp_exponent = 1.2121103201489674 # see https://www.kaggle.com/ambrosm/tpsjan22-03-linear-model for an explanation

In [18]:
all_df = add_gdp_data(all_df)

In [19]:
all_df

Unnamed: 0,row_id,date,country,store,product,num_sold,gdp
0,0,2015-01-01,Finland,KaggleMart,Kaggle Mug,329.0,5.461456
1,1,2015-01-01,Finland,KaggleMart,Kaggle Hat,520.0,5.461456
2,2,2015-01-01,Finland,KaggleMart,Kaggle Sticker,146.0,5.461456
3,3,2015-01-01,Finland,KaggleRama,Kaggle Mug,572.0,5.461456
4,4,2015-01-01,Finland,KaggleRama,Kaggle Hat,911.0,5.461456
...,...,...,...,...,...,...,...
6565,32863,2019-12-31,Sweden,KaggleMart,Kaggle Hat,,6.282042
6566,32864,2019-12-31,Sweden,KaggleMart,Kaggle Sticker,,6.282042
6567,32865,2019-12-31,Sweden,KaggleRama,Kaggle Mug,,6.282042
6568,32866,2019-12-31,Sweden,KaggleRama,Kaggle Hat,,6.282042


## Feature Engineering

### Time Features

The goal of this function is to create features that will capture seasonalities -- but **not** trends. The trends will (hopefully) be captured by the deployment of linear forecasting algorithms on raw time series data (consisting exclusively of dates and targets); we want to have seasonalities that the residual models can learn, however -- holidays, weekly patterns, climactic season patterns, etc.

The cell below will generate the `holidays` library's entries for the three countries. I may want to follow the template of @teckmengwong's code below, and add more holidays -- then, do some feature importance checking, and perhaps whittle down the features accordingly.

In [20]:
for c in [holidays.Finland, holidays.Sweden, holidays.Norway]:
#     print(c)
    for h in c(years = [2019], observed=True).items():
#         print(h)
        pass

In [21]:
def temporal_engineering(df):
    '''
    Function inspired by / borrowing from @teckmengwong and @ambrosm to create time features that will
    capture seasonality.
    '''
    
#     df[YEAR] = df[DATE].dt.year
    df['month'] = df['date'].dt.month
#     df['week'] = df['date'].dt.week # not used by Teck Meng Wong
#     df['day'] = df['date'].dt.day # not used by Teck Meng Wong
#     df['day_of_year'] = df['date'].dt.dayofyear # not used by Teck Meng Wong
#     df['day_of_month'] = df['date'].dt.days_in_month # not used by Teck Meng Wong
#     df['day_of_week'] = df['date'].dt.dayofweek # not used by Teck Meng Wong
#    df['weekday'] = df['date'].dt.weekday # not used by Teck Meng Wong
    # Teck Meng Wong mapped the integers to first-letters in triplets
    # I'm leaving it as integers, where winter=1, spring=2, summer=3, fall=4
    df['season'] = ((df['date'].dt.month % 12 + 3) // 3) #.map({1:'DJF', 2: 'MAM', 3:'JJA', 4:'SON'})
#     df['month'] = df['month'].apply(lambda x: calendar.month_abbr[x])

    df['wd4'] = df['date'].dt.weekday == 4
    df['wd56'] = df['date'].dt.weekday >= 5
#     df['wd6'] = df['date'].dt.weekday >= 6
#     df.loc[(df.date.dt.year != 2016) & (df.date.dt.month >=3), 'day_of_year'] += 1 # fix for leap years
    
    # 21 days cyclic for lunar
    dayofyear = df.date.dt.dayofyear # for convenience
    
    # here he's creating Fourier features
    for k in range(1, 32, 4):
        df[f'sin{k}'] = np.sin(dayofyear / 365 * 2 * math.pi * k)
        df[f'cos{k}'] = np.cos(dayofyear / 365 * 2 * math.pi * k)
        df[f'finland_sin{k}'] = np.where(df['country'] == 'Finland', df[f'sin{k}'], 0)
        df[f'finland_cos{k}'] = np.where(df['country'] == 'Finland', df[f'cos{k}'], 0)
        df[f'norway_sin{k}'] = np.where(df['country'] == 'Norway', df[f'sin{k}'], 0)
        df[f'norway_cos{k}'] = np.where(df['country'] == 'Norway', df[f'cos{k}'], 0)
        df[f'store_sin{k}'] = np.where(df['store'] == 'KaggleMart', df[f'sin{k}'], 0)
        df[f'store_cos{k}'] = np.where(df['store'] == 'KaggleMart', df[f'cos{k}'], 0)
        df[f'mug_sin{k}'] = np.where(df['product'] == 'Kaggle Mug', df[f'sin{k}'], 0)
        df[f'mug_cos{k}'] = np.where(df['product'] == 'Kaggle Mug', df[f'cos{k}'], 0)
        df[f'sticker_sin{k}'] = np.where(df['product'] == 'Kaggle Sticker', df[f'sin{k}'], 0)
        df[f'sticker_cos{k}'] = np.where(df['product'] == 'Kaggle Sticker', df[f'cos{k}'], 0)
    
#     df[f'semiweekly_sin'] = np.sin(dayofyear / 365 * 2 * math.pi * 14)
#     df[f'semiweekly_cos'] = np.cos(dayofyear / 365 * 2 * math.pi * 14)
#     df[f'lunar_sin'] = np.sin(dayofyear / 365 * 2 * math.pi * 21)
#     df[f'lunar_cos'] = np.cos(dayofyear / 365 * 2 * math.pi * 21)
    df[f'season_sin'] = np.sin(dayofyear / 365 * 2 * math.pi * 91.5)
    df[f'season_cos'] = np.cos(dayofyear / 365 * 2 * math.pi * 91.5)
#     df = pd.concat([df, pd.DataFrame({f'fin{ptr[1]}':
#                                       (df.date == pd.Timestamp(ptr[0])) & (df.country == 'Finland')
#                                       for ptr in holidays.Finland(years = [2015,2016,2017,2018,2019]).items()})], axis=1)
#     df = pd.concat([df, pd.DataFrame({f'nor{ptr[1]}':
#                                       (df.date == pd.Timestamp(ptr[0])) & (df.country == 'Norway')
#                                       for ptr in holidays.Norway(years = [2015,2016,2017,2018,2019]).items()})], axis=1)
#     df = pd.concat([df, pd.DataFrame({f'swe{ptr[1]}':
#                                       (df.date == pd.Timestamp(ptr[0])) & (df.country == 'Sweden')
#                                       for ptr in holidays.Sweden(years = [2015,2016,2017,2018,2019]).items()})], axis=1)

    # End of year
    # Dec - teckmengwong
    for d in range(24, 32):
        df[f"dec{d}"] = (df.date.dt.month == 12) & (df.date.dt.day == d)
    # I'm unsure of the logic of only doing this for Norway
    for d in range(24, 32):
        df[f"n-dec{d}"] = (df.date.dt.month == 12) & (df.date.dt.day == d) & (df.country == 'Norway')
    
    # not sure why he's using different date ranges for each country here
    # Jan - teckmengwong
    for d in range(1, 14):
        df[f"f-jan{d}"] = (df.date.dt.month == 1) & (df.date.dt.day == d) & (df.country == 'Finland')
    for d in range(1, 10):
        df[f"n-jan{d}"] = (df.date.dt.month == 1) & (df.date.dt.day == d) & (df.country == 'Norway')
    for d in range(1, 15):
        df[f"s-jan{d}"] = (df.date.dt.month == 1) & (df.date.dt.day == d) & (df.country == 'Sweden')
    
    
    # May - tekcmengwong
    for d in list(range(1, 10)): # May Day and after, I guess
        df[f"may{d}"] = (df.date.dt.month == 5) & (df.date.dt.day == d)
    for d in list(range(19, 26)):
        df[f"may{d}"] = (df.date.dt.month == 5) & (df.date.dt.day == d) & (df.country == 'Norway')
    # June 
    for d in list(range(8, 14)):
        df[f"june{d}"] = (df.date.dt.month == 6) & (df.date.dt.day == d) & (df.country == 'Sweden')
    
    #Swedish Rock Concert - teckmengwong
    #Jun 3, 2015 – Jun 6, 2015
    #Jun 8, 2016 – Jun 11, 2016
    #Jun 7, 2017 – Jun 10, 2017
    #Jun 6, 2018 – Jun 10, 2018
    #Jun 5, 2019 – Jun 8, 2019
    swed_rock_fest  = df.date.dt.year.map({2015: pd.Timestamp(('2015-06-6')),
                                         2016: pd.Timestamp(('2016-06-11')),
                                         2017: pd.Timestamp(('2017-06-10')),
                                         2018: pd.Timestamp(('2018-06-10')),
                                         2019: pd.Timestamp(('2019-06-8'))})

    df = pd.concat([df, pd.DataFrame({f"swed_rock_fest{d}":
                                      (df.date - swed_rock_fest == np.timedelta64(d, "D")) & (df.country == 'Sweden')
                                      for d in list(range(-3, 3))})], axis=1)

    
    # Last Wednesday of June - teckmengwong
    wed_june_date = df.date.dt.year.map({2015: pd.Timestamp(('2015-06-24')),
                                         2016: pd.Timestamp(('2016-06-29')),
                                         2017: pd.Timestamp(('2017-06-28')),
                                         2018: pd.Timestamp(('2018-06-27')),
                                         2019: pd.Timestamp(('2019-06-26'))})
    for d in list(range(-4, 6)):
        df[f"wed_june{d}"] = (df.date - wed_june_date == np.timedelta64(d, "D")) & (df.country != 'Norway')
        
    # First Sunday of November - teckmengwong
    sun_nov_date = df.date.dt.year.map({2015: pd.Timestamp(('2015-11-1')),
                                         2016: pd.Timestamp(('2016-11-6')),
                                         2017: pd.Timestamp(('2017-11-5')),
                                         2018: pd.Timestamp(('2018-11-4')),
                                         2019: pd.Timestamp(('2019-11-3'))})
    df = pd.concat([df, pd.DataFrame({f"sun_nov{d}":
                                      (df.date - sun_nov_date == np.timedelta64(d, "D")) & (df.country == 'Norway')
                                      for d in list(range(0, 9))})], axis=1)
    
    # First half of December (Independence Day of Finland, 6th of December) -teckmengwong
    df = pd.concat([df, pd.DataFrame({f"dec{d}":
                                      (df.date.dt.month == 12) & (df.date.dt.day == d) & (df.country == 'Finland')
                                      for d in list(range(6, 14))})], axis=1)
    
    # Easter -teckmengwong
    easter_date = df.date.apply(lambda date: pd.Timestamp(easter.easter(date.year)))
    df = pd.concat([df, pd.DataFrame({f"easter{d}":
                                      (df.date - easter_date == np.timedelta64(d, "D"))
                                      for d in list(range(-2, 11)) + list(range(40, 48)) + list(range(50, 59))})], axis=1)
    
    return df

In [22]:
temporal_all_df = temporal_engineering(all_df)

In [23]:
temporal_all_df

Unnamed: 0,row_id,date,country,store,product,num_sold,gdp,month,season,wd4,...,easter47,easter50,easter51,easter52,easter53,easter54,easter55,easter56,easter57,easter58
0,0,2015-01-01,Finland,KaggleMart,Kaggle Mug,329.0,5.461456,1,1,False,...,False,False,False,False,False,False,False,False,False,False
1,1,2015-01-01,Finland,KaggleMart,Kaggle Hat,520.0,5.461456,1,1,False,...,False,False,False,False,False,False,False,False,False,False
2,2,2015-01-01,Finland,KaggleMart,Kaggle Sticker,146.0,5.461456,1,1,False,...,False,False,False,False,False,False,False,False,False,False
3,3,2015-01-01,Finland,KaggleRama,Kaggle Mug,572.0,5.461456,1,1,False,...,False,False,False,False,False,False,False,False,False,False
4,4,2015-01-01,Finland,KaggleRama,Kaggle Hat,911.0,5.461456,1,1,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6565,32863,2019-12-31,Sweden,KaggleMart,Kaggle Hat,,6.282042,12,1,False,...,False,False,False,False,False,False,False,False,False,False
6566,32864,2019-12-31,Sweden,KaggleMart,Kaggle Sticker,,6.282042,12,1,False,...,False,False,False,False,False,False,False,False,False,False
6567,32865,2019-12-31,Sweden,KaggleRama,Kaggle Mug,,6.282042,12,1,False,...,False,False,False,False,False,False,False,False,False,False
6568,32866,2019-12-31,Sweden,KaggleRama,Kaggle Hat,,6.282042,12,1,False,...,False,False,False,False,False,False,False,False,False,False


At this point, the `temporal_all_df` DataFrame contains all the time features for both the training and testing sets.
* **Todo**: consider not only adding in holidays from `holidays`, but also borrowing ideas from the AmbrosM Linear notebook too (which creates fewer features, populating them instead with temporal distances from the selected holidays).

### Target Transformation
Now, I'll do the target transformation proposed by @AmbrosM. (I'll do it to the non-encoded DataFrame too, for testing with Prophet and NeuralProphet later.)

In [24]:
for df in [temporal_all_df]:
    df['target'] = np.log(df['num_sold'] / df['gdp']**gdp_exponent)

In [25]:
# encoded_all_df['target'] = np.log(encoded_all_df['num_sold'] / (encoded_all_df['gdp']**gdp_exponent))

In [26]:
temporal_all_df

Unnamed: 0,row_id,date,country,store,product,num_sold,gdp,month,season,wd4,...,easter50,easter51,easter52,easter53,easter54,easter55,easter56,easter57,easter58,target
0,0,2015-01-01,Finland,KaggleMart,Kaggle Mug,329.0,5.461456,1,1,False,...,False,False,False,False,False,False,False,False,False,3.738239
1,1,2015-01-01,Finland,KaggleMart,Kaggle Hat,520.0,5.461456,1,1,False,...,False,False,False,False,False,False,False,False,False,4.196010
2,2,2015-01-01,Finland,KaggleMart,Kaggle Sticker,146.0,5.461456,1,1,False,...,False,False,False,False,False,False,False,False,False,2.925788
3,3,2015-01-01,Finland,KaggleRama,Kaggle Mug,572.0,5.461456,1,1,False,...,False,False,False,False,False,False,False,False,False,4.291321
4,4,2015-01-01,Finland,KaggleRama,Kaggle Hat,911.0,5.461456,1,1,False,...,False,False,False,False,False,False,False,False,False,4.756724
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6565,32863,2019-12-31,Sweden,KaggleMart,Kaggle Hat,,6.282042,12,1,False,...,False,False,False,False,False,False,False,False,False,
6566,32864,2019-12-31,Sweden,KaggleMart,Kaggle Sticker,,6.282042,12,1,False,...,False,False,False,False,False,False,False,False,False,
6567,32865,2019-12-31,Sweden,KaggleRama,Kaggle Mug,,6.282042,12,1,False,...,False,False,False,False,False,False,False,False,False,
6568,32866,2019-12-31,Sweden,KaggleRama,Kaggle Hat,,6.282042,12,1,False,...,False,False,False,False,False,False,False,False,False,


### Label Encoding

In [27]:
from sklearn.preprocessing import LabelEncoder
le_dict = {feature: LabelEncoder().fit(orig_train_df[feature]) for feature in ['country', 'product', 'store']}

In [28]:
# for key in le_dict.keys():
#     print(f"Values for key {key} are {le_dict[key].inverse_transform(range(len(le_dict[key].values())))}")#"
print(le_dict['country'].inverse_transform([0,1,2]))
print(le_dict['product'].inverse_transform([0,1,2]))
print(le_dict['store'].inverse_transform([0,1]))

['Finland' 'Norway' 'Sweden']
['Kaggle Hat' 'Kaggle Mug' 'Kaggle Sticker']
['KaggleMart' 'KaggleRama']


Now, we'll do the encoding.

In [29]:
encoded_all_df = temporal_all_df.copy()

In [30]:
for feature in ['country', 'product', 'store']:
    encoded_all_df[feature] = le_dict[feature].transform(temporal_all_df[feature])

At this point, the `encoded_all_df` can be used -- perhaps with a call to `LabelEncoder.inverse_transform` -- to recover the "original" data when necessary (e.g. for feeding it into Prophet and NeuralProphet)

In [31]:
encoded_all_df

Unnamed: 0,row_id,date,country,store,product,num_sold,gdp,month,season,wd4,...,easter50,easter51,easter52,easter53,easter54,easter55,easter56,easter57,easter58,target
0,0,2015-01-01,0,0,1,329.0,5.461456,1,1,False,...,False,False,False,False,False,False,False,False,False,3.738239
1,1,2015-01-01,0,0,0,520.0,5.461456,1,1,False,...,False,False,False,False,False,False,False,False,False,4.196010
2,2,2015-01-01,0,0,2,146.0,5.461456,1,1,False,...,False,False,False,False,False,False,False,False,False,2.925788
3,3,2015-01-01,0,1,1,572.0,5.461456,1,1,False,...,False,False,False,False,False,False,False,False,False,4.291321
4,4,2015-01-01,0,1,0,911.0,5.461456,1,1,False,...,False,False,False,False,False,False,False,False,False,4.756724
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6565,32863,2019-12-31,2,0,0,,6.282042,12,1,False,...,False,False,False,False,False,False,False,False,False,
6566,32864,2019-12-31,2,0,2,,6.282042,12,1,False,...,False,False,False,False,False,False,False,False,False,
6567,32865,2019-12-31,2,1,1,,6.282042,12,1,False,...,False,False,False,False,False,False,False,False,False,
6568,32866,2019-12-31,2,1,0,,6.282042,12,1,False,...,False,False,False,False,False,False,False,False,False,


### Pseudolabeling

I'm not going to try this right now, but I may return to it later -- I note that Teck Meng Wong had some good results with it.

### Data Splitting, Modification

Now that the preprocessing is done, I'm going to split the data back into the train and test sets; then, I'll create a view on the dataframes that omits the year. The year-less dataframes will be suitable for residual learning.

In [32]:
# all_df = encoded_all_df.drop(columns=['num_sold', 'row_id'])
all_df = encoded_all_df.drop(columns=['row_id'])

In [33]:
tv_df_encoded = all_df[:len(orig_train_df)] # training and validation sets
test_df_encoded = all_df[len(orig_train_df):]


In [34]:
# train_df = encoded_all_df.iloc[np.where(encoded_all_df['date'] < '2019-01-01'), :]
# test_df = encoded_all_df[[np.where(encoded_all_df['date'] > '2018-12-31')]]

# encoded_tv_df = encoded_all_df.drop(columns=['row_id'])[:len(orig_train_df)]
# encoded_test_df = encoded_all_df.drop(columns=['row_id'])[len(orig_train_df):]

# valid_df = tv_df[tv_df['date'] > '2017-12-31']
# train_df = tv_df[tv_df['date'] <= '2017-12-31']

# train_and_valid_residual_df = train_and_valid_df.drop(columns=['date'])
# test_residual_df = test_df.drop(columns=['date'])

# len(valid_df) + len(train_df) == len(tv_df)

# encoded_tv_df

# Training

### Forecasting Models Prep
First, we'll set up functions to handle the training of forecasting models which will discern trends, and which may -- or may not -- yield insights concerning seasonality. While the Scikit-Learn models will be able to share a single trainer function, the Prophet and NeuralProphet models have subtly different expectations of their data, and as such will require separate handling.

In [35]:
from sklearn.linear_model import Ridge, HuberRegressor, LinearRegression, Lasso
from sklearn.neural_network import MLPRegressor
from prophet import Prophet
from neuralprophet import NeuralProphet
# earth? wouldn't install via pip on my machine at first

#### (Preprepared Preds)

The next cell contains code to import already-existing predictions -- but I think it's better to centralize the code that produces them here, and will comment out the import code for now.

In [36]:
# prophet_trainset = load(predpath/'20220121_prophet_baseline_trainset.joblib')

# neural_trainset = load(predpath/'20220121_neuralprophet_baseline_trainset.joblib')
# neural_test_preds = load(predpath/'20220121_neuralprophet_baseline_testset.joblib')

# ridge_tv_preds = load(predpath/'20210121_ridge_baseline_trainset_preds.joblib')
# ridge_test_preds = load(predpath/'20220121_ridge_testset_preds.joblib')

And this cell would handle the parsing

In [37]:
# neural_tv_preds = neural_trainset['prophet_forecast']
# prophet_tv_preds = prophet_trainset['prophet_forecast']

# neural_train_preds = neural_tv_preds[:train_length]
# neural_valid_preds = neural_tv_preds[train_length:]

# prophet_train_preds = prophet_tv_preds[:train_length]
# prophet_valid_preds = prophet_tv_preds[train_length:]

# train_length = len(neural_trainset[neural_trainset['date'] <= '2017-12-31'])

# ridge_train_preds = ridge_tv_preds[:train_length]
# ridge_valid_preds = ridge_tv_preds[train_length:]

#### Scikit-Learn Linear Models Prep

Linear models from Scikit-Learn seemingly require that datetime data be converted to numerics.

In [38]:
import datetime as dt

In [39]:
# train_linear_df = train_df.copy()
# valid_linear_df = valid_df.copy()
# test_linear_df = test_df.copy()
# tv_linear_df = tv_df.copy()



### Forecasters

#### Hyperparameters
I'll hard-code them for now, but in the future may Optuna them. May want to create a dict of all the kwargs to be used for all the models, with the model names as keys

In [40]:
prophet_kwargs = {
    'growth':'linear',
#     'holidays':holidays_train, # will add this in-function
    'n_changepoints':10,
    'changepoint_range':0.4,
    'yearly_seasonality':True,
    'weekly_seasonality':True,
    'daily_seasonality':False,
    'seasonality_mode':'additive',
    'seasonality_prior_scale':25,
    'holidays_prior_scale':100,
    'changepoint_prior_scale':0.01,
    'interval_width':0.5,
    'uncertainty_samples':False
}

neuralprophet_kwargs = {
    'growth':'linear',
    'n_changepoints':10,
    'changepoints_range':0.4,
    'trend_reg':1,
    'trend_reg_threshold':False,
    'yearly_seasonality':True,
    'weekly_seasonality':True,
    'daily_seasonality':False,
    'seasonality_mode':'additive',
    'seasonality_reg':1,
    'n_forecasts':365,
    'normalize':'off'
}

# model_params['hyperparams'] = str(neuralprophet_kwargs)
# model_params['holiday_source'] = 'Prophet builtin for each country'

#### Trainers

##### NeuralProphet
I'm leaving the folds as they are. ~~Label encoding shouldn't matter -- the values are just being iterated over anyway.~~ It does matter because the Prophets use the strings to identify countries' holidays to add. Not sure about doing the target transform -- if you try it, just have the trainer call pass `target='target'`.

In [41]:
prophet_folds = folds = [
    ('2015-01-01', '2018-01-01'),
    ('2018-01-01', '2019-01-01'),
]

In [42]:
prophet_tv_df = tv_df_encoded.copy() # encoded_tv_df.copy()
prophet_test_df = test_df_encoded.copy() # encoded_test_df.copy()

In [43]:
for feature in ['country', 'product', 'store']:
    prophet_tv_df[feature] = orig_train_df[feature]
    prophet_test_df[feature] = orig_test_df[feature]


In [44]:
prophet_tv_df.head()

Unnamed: 0,date,country,store,product,num_sold,gdp,month,season,wd4,wd56,...,easter50,easter51,easter52,easter53,easter54,easter55,easter56,easter57,easter58,target
0,2015-01-01,Finland,KaggleMart,Kaggle Mug,329.0,5.461456,1,1,False,False,...,False,False,False,False,False,False,False,False,False,3.738239
1,2015-01-01,Finland,KaggleMart,Kaggle Hat,520.0,5.461456,1,1,False,False,...,False,False,False,False,False,False,False,False,False,4.19601
2,2015-01-01,Finland,KaggleMart,Kaggle Sticker,146.0,5.461456,1,1,False,False,...,False,False,False,False,False,False,False,False,False,2.925788
3,2015-01-01,Finland,KaggleRama,Kaggle Mug,572.0,5.461456,1,1,False,False,...,False,False,False,False,False,False,False,False,False,4.291321
4,2015-01-01,Finland,KaggleRama,Kaggle Hat,911.0,5.461456,1,1,False,False,...,False,False,False,False,False,False,False,False,False,4.756724


In [45]:
countries_enc = le_dict['country'].transform(countries)
stores_enc = le_dict['store'].transform(stores)
products_enc = le_dict['product'].transform(products)

countries, countries_enc

(['Sweden', 'Finland', 'Norway'], array([2, 0, 1]))

In [46]:
def neuralprophet_trainer(model_kwargs=neuralprophet_kwargs, countries=countries, stores=stores, products=products, folds=prophet_folds, 
                          df_train=prophet_tv_df, df_test=prophet_test_df, target='num_sold', wandb_tracked=False):
    train_smape = 0
    val_smape = 0
    
    if wandb_tracked:
#         exmodel_config['arch'] = arch
#         exmodel_config[f'{arch}_params'] = str(model_params)
        wandb.init(
            project="202201_Kaggle_tabular_playground",
            save_code=True,
            tags=wandb_config['tags'],
            name=wandb_config['name'],
            notes=wandb_config['notes'],
            config=exmodel_config
    )
    
    # no label encoding here -- but test it with too
    for country in countries:
        for store in stores:
            for product in products:
                for fold, (start, end) in enumerate(folds):
                    # Skip iteration if it's the last fold
                    if fold == len(folds) - 1:
                        continue

                    # put only those rows in that are in the training window and have the correct country, store, and product
                    train_idx = (df_train['date'] >= start) &\
                                (df_train['date'] < end) &\
                                (df_train['country'] == country) &\
                                (df_train['store'] == store) &\
                                (df_train['product'] == product)

                    # redefine the training set in the local (holdout) sense
                    train = df_train.loc[train_idx, ['date', target]].reset_index(drop=True)

                    val_idx = (df_train['date'] >= folds[fold + 1][0]) &\
                              (df_train['date'] < folds[fold + 1][1]) &\
                              (df_train['country'] == country) &\
                              (df_train['store'] == store) &\
                              (df_train['product'] == product)

                    val = df_train.loc[val_idx, ['date', target]].reset_index(drop=True)

                    # rename the columns for standardization (this seems conventional)
                    train = train.rename(columns={'date': 'ds', target: 'y'})
                    val = val.rename(columns={'date': 'ds', target: 'y'})

#                     model = Prophet(**prophet_kwargs)
                    model = NeuralProphet(**model_kwargs)

                    model = model.add_country_holidays(country_name=country) # uses FacebookProphet or NeuralProphet API to add holidays
                    print(train.columns)
                    model.fit(train, freq='D') # neuralprophet
                    # prophet
#                     train_predictions = model.predict(train[['ds']])['yhat']
#                     val_predictions = model.predict(val[['ds']])['yhat']
                    # neuralprophet
                    train_predictions = model.predict(train)['yhat1']
                    val_predictions = model.predict(val)['yhat1']
                    df_train.loc[train_idx, 'neuralprophet_forecast'] = train_predictions.values
                    df_train.loc[val_idx, 'neuralprophet_forecast'] =  val_predictions.values

                    train_score = SMAPE(train['y'].values, train_predictions.values)
                    val_score = SMAPE(val['y'].values, val_predictions.values)
            
                    if wandb_tracked:
                        wandb.log({f"{(country,store,product)}_valid_smape": val_score})
            
                    train_smape += train_score
                    val_smape += val_score
            
                    print(f'\nTraining Range [{start}, {end}) - {country} - {store} - {product} - Train SMAPE: {train_score:4f}')
                    print(f'Validation Range [{folds[fold + 1][0]}, {folds[fold + 1][1]}) - {country} - {store} - {product} - Validation SMAPE: {val_score:4f}\n')

                    test_idx = (df_test['country'] == country) &\
                               (df_test['store'] == store) &\
                               (df_test['product'] == product)
                    test = df_test.loc[test_idx, ['date']].reset_index(drop=True)
                    
                    test = test.rename(columns={'date': 'ds'})
                    test['y'] = np.nan
                    test_predictions = model.predict(test)['yhat1']
                    
                    
                    df_test.loc[test_idx, 'neuralprophet_forecast'] = test_predictions.values
    
    train_smape /= (3*2*3)
    val_smape /= (3*2*3)
#     train_
    
    if wandb_tracked:
        wandb.log({'overall_train_smape': train_smape, 'overall_valid_smape': val_smape})
        wandb.finish()
    return df_train['neuralprophet_forecast'], df_test['neuralprophet_forecast']#, train_smape, val_smape

##### Prophet Trainer

In [47]:
def prophet_trainer(prophet_kwargs=prophet_kwargs, countries=countries, stores=stores, products=products, folds=prophet_folds, 
                    df_train=prophet_tv_df, df_test=prophet_test_df, target='num_sold', wandb_tracked=False):
    train_smape = 0
    val_smape = 0
    
    if wandb_tracked:
#         exmodel_config['arch'] = arch
#         exmodel_config[f'{arch}_params'] = str(model_params)
        wandb.init(
            project="202201_Kaggle_tabular_playground",
            save_code=True,
            tags=wandb_config['tags'],
            name=wandb_config['name'],
            notes=wandb_config['notes'],
            config=exmodel_config
    )
    
    for country in countries:
        for store in stores:
            for product in products:
                for fold, (start, end) in enumerate(folds):
                    # Skip iteration if it's the last fold
                    if fold == len(folds) - 1:
                        continue

                    # put only those rows in that are in the training window and have the correct country, store, and product
                    train_idx = (df_train['date'] >= start) &\
                                (df_train['date'] < end) &\
                                (df_train['country'] == country) &\
                                (df_train['store'] == store) &\
                                (df_train['product'] == product)
                    
#                     print(train_idx)

                    # redefine the training set in the local (holdout) sense
                    train = df_train.loc[train_idx, ['date', target]].reset_index(drop=True)
#                     print(train.shape)

                    val_idx = (df_train['date'] >= folds[fold + 1][0]) &\
                              (df_train['date'] < folds[fold + 1][1]) &\
                              (df_train['country'] == country) &\
                              (df_train['store'] == store) &\
                              (df_train['product'] == product)

                    val = df_train.loc[val_idx, ['date', target]].reset_index(drop=True)

                    # rename the columns for standardization (this seems conventional)
                    train = train.rename(columns={'date': 'ds', target: 'y'})
                    val = val.rename(columns={'date': 'ds', target: 'y'})

                    model = Prophet(**prophet_kwargs)
#                                         growth='linear',
#                     #                     holidays=holidays_train,
#                                         n_changepoints=10,
#                                         changepoint_range=0.4,
#                                         yearly_seasonality=True,
#                                         weekly_seasonality=True,
#                                         daily_seasonality=False,
#                                         seasonality_mode='additive',
#                                         seasonality_prior_scale=25,
#                                         holidays_prior_scale=100,
#                                         changepoint_prior_scale=0.01,
#                                         interval_width=0.5,
#                                         uncertainty_samples=False
#                                     )
                    model.add_country_holidays(country_name=country) # uses FacebookProphet API to add holidays
                    model.fit(train)
        
                    train_predictions = model.predict(train[['ds']])['yhat']
                    val_predictions = model.predict(val[['ds']])['yhat']
                    df_train.loc[train_idx, 'prophet_forecast'] = train_predictions.values
                    df_train.loc[val_idx, 'prophet_forecast'] =  val_predictions.values

                    train_score = SMAPE(train['y'].values, train_predictions.values)
                    val_score = SMAPE(val['y'].values, val_predictions.values)
            
                    if wandb_tracked:
                        wandb.log({f"{(country,store,product)}_valid_smape": val_score})
            
                    train_smape += train_score
                    val_smape += val_score
            
                    print(f'\nTraining Range [{start}, {end}) - {country} - {store} - {product} - Train SMAPE: {train_score:4f}')
                    print(f'Validation Range [{folds[fold + 1][0]}, {folds[fold + 1][1]}) - {country} - {store} - {product} - Validation SMAPE: {val_score:4f}\n')

                    test_idx = (df_test['country'] == country) &\
                               (df_test['store'] == store) &\
                               (df_test['product'] == product)
                    test = df_test.loc[test_idx, ['date']].reset_index(drop=True)
                    
                    test = test.rename(columns={'date': 'ds'})
                    test_predictions = model.predict(test[['ds']])['yhat']
                    
                    
                    df_test.loc[test_idx, 'prophet_forecast'] = test_predictions.values
    
    train_smape /= (3*2*3)
    val_smape /= (3*2*3)
    
    if wandb_tracked:
        wandb.log({'overall_train_smape': train_smape, 'overall_valid_smape': val_smape})
        wandb.finish()
    return df_train['prophet_forecast'], df_test['prophet_forecast']#, train_smape, val_smape

##### Scikit-Learn Models

In [48]:
test_df

Unnamed: 0,row_id,date,country,store,product
0,26298,2019-01-01,Finland,KaggleMart,Kaggle Mug
1,26299,2019-01-01,Finland,KaggleMart,Kaggle Hat
2,26300,2019-01-01,Finland,KaggleMart,Kaggle Sticker
3,26301,2019-01-01,Finland,KaggleRama,Kaggle Mug
4,26302,2019-01-01,Finland,KaggleRama,Kaggle Hat
...,...,...,...,...,...
6565,32863,2019-12-31,Sweden,KaggleMart,Kaggle Hat
6566,32864,2019-12-31,Sweden,KaggleMart,Kaggle Sticker
6567,32865,2019-12-31,Sweden,KaggleRama,Kaggle Mug
6568,32866,2019-12-31,Sweden,KaggleRama,Kaggle Hat


In [63]:
def sklearn_trainer(estimator, model_kwargs={}, tv_df=tv_df_encoded, test_df=test_df_encoded, #X=X, y=y, X_valid=X_valid, y_valid=y_valid, X_test=X_test, 
                    folds=prophet_folds, countries=countries_enc, stores=stores_enc, products=products_enc, target='target',
                    by_combo=False, wandb_tracked=False):
    
    
    
    train_smape = 0
    val_smape = 0
    
    if wandb_tracked:
#         exmodel_config['arch'] = arch
#         exmodel_config[f'{arch}_params'] = str(model_params)
        wandb.init(
            project="202201_Kaggle_tabular_playground",
            save_code=True,
            tags=wandb_config['tags'],
            name=wandb_config['name'],
            notes=wandb_config['notes'],
            config=exmodel_config
    )
    
    # drop whichever version of the dependent variable is not being used
#     for df in [tv_df, test_df]:
    if target == 'num_sold': 
        tv_df = tv_df.drop(columns=['target'])
        test_df = test_df.drop(columns=['target'])
    else:
        tv_df = tv_df.drop(columns=['num_sold'])
        test_df = test_df.drop(columns=['num_sold'])
            
    print("'num_sold' in test_df.columns == ", 'num_sold' in test_df.columns)
    
    if by_combo: # handling each combination of country, store, and product separately
        for store in stores:
            for product in products:
                for fold, (start, end) in enumerate(folds):
                    # Skip iteration if it's the last fold
                    if fold == len(folds) - 1:
                        continue
                    
                    # put only those rows in that are in the training window and have the correct country, store, and product
                    train_idx = (tv_df['date'] >= start) &\
                                (tv_df['date'] < end) &\
                                (tv_df['country'] == country) &\
                                (tv_df['store'] == store) &\
                                (tv_df['product'] == product)

#                     print(train_idx)

                    # redefine the training set in the local (holdout) sense
                    train = tv_df.loc[train_idx, :].reset_index(drop=True)
                    print(train.shape)

                    val_idx = (tv_df['date'] >= folds[fold + 1][0]) &\
                              (tv_df['date'] < folds[fold + 1][1]) &\
                              (tv_df['country'] == country) &\
                              (tv_df['store'] == store) &\
                              (tv_df['product'] == product)

                    val = tv_df.loc[val_idx, :].reset_index(drop=True)
                    
                    test_idx = (test_df['country'] == country) &\
                               (test_df['store'] == store) &\
                               (test_df['product'] == product)
                    test = test_df.loc[test_idx, :].reset_index(drop=True)
                    
                    # with the training and validation sets sorted out, make them integers for model fitting
                    for df in [train, val, test]:
                        df['date'] = df['date'].map(dt.datetime.toordinal)
        
                    X = train.drop(columns=[target])
                    y = train[target]
                    X_valid = val.drop(columns=[target])
                    y_valid = val[target]
                    X_test = test.drop(columns=[target])
        
                    model = estimator(model_kwargs)
                    model.fit(X,y)
                    
                    model_train_preds = model.predict(X)
                    model_valid_preds = model.predict(X_valid)
                    model_test_preds = model.predict(X_test)
                    
                    tv_df.loc[train_idx, 'model_forecast'] = model_train_preds.values
                    tv_df.loc[val_idx, 'model_forecast'] =  model_valid_preds.values
                    test_df.loc[test_idx, 'model_forecast'] = model_test_preds.values
                    
        
    else: # don't separate out by combination of country, store, and product
        # split the dataset such that 2018 is the validation set
        train = tv_df[tv_df['date'] > '2017-12-31']
        valid = tv_df[tv_df['date'] <= '2017-12-31']
        test = test_df.copy()
#         print(test_df['date'].dtype)
        print("NaNs after test instantiation: ", test.isna().sum())
    
        # convert the datetime objects to integers and then drop the transformed or untransformed target
            
        for df in [train, valid, test]:
            df['date'] = df['date'].map(dt.datetime.toordinal)

                
        # separate out the independent and dependent variables
        X = train.drop(columns=[target])
        y = train[target]
        X_valid = valid.drop(columns=[target])
        y_valid = valid[target]
        X_test = test.drop(columns=[target])
        
        print("NaNs after X_test instantiation: ", X_test.isna().sum())
        print("'num_sold' in X_test.columns == ", 'num_sold' in X_test.columns)
        
        model = estimator(**model_kwargs)
        model.fit(X,y)
        
        # make predictions on both the training and validation set, and put them together
        model_train_preds = model.predict(X)
        model_valid_preds = model.predict(X_valid)
        model_tv_preds = pd.concat([pd.Series(model_train_preds), pd.Series(model_valid_preds)], axis=1)
        
        # make predictions on the test set
        model_test_preds = model.predict(X_test)
#         print("model_test_preds.shape == ", model_test_preds.shape)
        
        tv_df['model_forecast'] = model_tv_preds
        test_df['model_forecast'] = model_test_preds
        
        # reverse the dependent variable transform if appropriate
        if target == 'target':
#             model_tv_preds = np.multiply(np.exp(model_tv_preds), tv_df['gdp']**gdp_exponent)
            tv_df['model_forecast'] = np.exp(tv_df['model_forecast']) * tv_df['gdp']**gdp_exponent
#             model_test_preds = np.multiply(np.exp(model_test_preds), test_df['gdp']**gdp_exponent)
            test_df['model_forecast'] = np.exp(test_df['model_forecast']) * test_df['gdp']**gdp_exponent
#             model_test_preds = np.exp(model_test_preds) * test_df['gdp']**gdp_exponent
        
#         tv_df['model_forecast'] = model_tv_preds
#         test_df['model_forecast'] = model_test_preds
    return tv_df, test_df
#     return tv_df['model_forecast'], test_df['model_forecast']
#     return model_tv_preds, model_test_preds
    

In [61]:
test_df_encoded

Unnamed: 0,date,country,store,product,num_sold,gdp,month,season,wd4,wd56,...,easter50,easter51,easter52,easter53,easter54,easter55,easter56,easter57,easter58,target
0,2019-01-01,0,0,1,,5.597614,1,1,False,False,...,False,False,False,False,False,False,False,False,False,
1,2019-01-01,0,0,0,,5.597614,1,1,False,False,...,False,False,False,False,False,False,False,False,False,
2,2019-01-01,0,0,2,,5.597614,1,1,False,False,...,False,False,False,False,False,False,False,False,False,
3,2019-01-01,0,1,1,,5.597614,1,1,False,False,...,False,False,False,False,False,False,False,False,False,
4,2019-01-01,0,1,0,,5.597614,1,1,False,False,...,False,False,False,False,False,False,False,False,False,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6565,2019-12-31,2,0,0,,6.282042,12,1,False,False,...,False,False,False,False,False,False,False,False,False,
6566,2019-12-31,2,0,2,,6.282042,12,1,False,False,...,False,False,False,False,False,False,False,False,False,
6567,2019-12-31,2,1,1,,6.282042,12,1,False,False,...,False,False,False,False,False,False,False,False,False,
6568,2019-12-31,2,1,0,,6.282042,12,1,False,False,...,False,False,False,False,False,False,False,False,False,


#### Calls

In [51]:
# %time
# prophet_tv_preds, prophet_test_preds = prophet_trainer(target='num_sold')

In [52]:
# %time
# neural_tv_preds, neural_test_preds = neuralprophet_trainer(target='num_sold')

In [64]:
%time
ridge_tv_preds, ridge_test_preds = sklearn_trainer(estimator=Ridge)

CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 4.77 µs
'num_sold' in test_df.columns ==  False
NaNs after test instantiation:  date           0
country        0
store          0
product        0
gdp            0
            ... 
easter55       0
easter56       0
easter57       0
easter58       0
target      6570
Length: 245, dtype: int64
NaNs after X_test instantiation:  date        0
country     0
store       0
product     0
gdp         0
           ..
easter54    0
easter55    0
easter56    0
easter57    0
easter58    0
Length: 244, dtype: int64
'num_sold' in X_test.columns ==  False


ValueError: Wrong number of items passed 2, placement implies 1

In [59]:
ridge_tv_preds

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,26288,26289,26290,26291,26292,26293,26294,26295,26296,26297
0,349.173958,263.793801,,,,,,,,,...,,,,,,,,,,
1,560.849242,423.710160,,,,,,,,,...,,,,,,,,,,
2,163.252911,123.334243,,,,,,,,,...,,,,,,,,,,
3,609.217982,460.251756,,,,,,,,,...,,,,,,,,,,
4,978.536444,739.264319,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19723,,835.825248,,,,,,,,,...,,,,,,,,,,
19724,,244.190493,,,,,,,,,...,,,,,,,,,,
19725,,910.642260,,,,,,,,,...,,,,,,,,,,
19726,,1458.714888,,,,,,,,,...,,,,,,,,,,


In [57]:
len(prophet_tv_preds)

26298

### Forecasts
Now, create an iterable collection consisting of all the forecaster predictions.

In [None]:
forecast_tv_preds = {
    'prophet': prophet_tv_preds,
    'neuralprophet': neural_tv_preds,
}

forecast_test_preds = {
    'prophet': prophet_test_preds,
    'neuralprophet': neural_test_preds,
}

### Residuals

In [85]:
encoded_all_df

Unnamed: 0,row_id,date,country,store,product,num_sold,gdp,month,season,wd4,...,easter50,easter51,easter52,easter53,easter54,easter55,easter56,easter57,easter58,target
0,0,2015-01-01,0,0,1,329.000000,5.461456,1,1,False,...,False,False,False,False,False,False,False,False,False,3.738239
1,1,2015-01-01,0,0,0,520.000000,5.461456,1,1,False,...,False,False,False,False,False,False,False,False,False,4.196010
2,2,2015-01-01,0,0,2,146.000000,5.461456,1,1,False,...,False,False,False,False,False,False,False,False,False,2.925788
3,3,2015-01-01,0,1,1,572.000000,5.461456,1,1,False,...,False,False,False,False,False,False,False,False,False,4.291321
4,4,2015-01-01,0,1,0,911.000000,5.461456,1,1,False,...,False,False,False,False,False,False,False,False,False,4.756724
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6565,32863,2019-12-31,2,0,0,615.873484,6.282042,12,1,False,...,False,False,False,False,False,False,False,False,False,
6566,32864,2019-12-31,2,0,2,179.168451,6.282042,12,1,False,...,False,False,False,False,False,False,False,False,False,
6567,32865,2019-12-31,2,1,1,675.905229,6.282042,12,1,False,...,False,False,False,False,False,False,False,False,False,
6568,32866,2019-12-31,2,1,0,1078.850969,6.282042,12,1,False,...,False,False,False,False,False,False,False,False,False,


In [89]:
dump(encoded_all_df, datapath/'encoded_train+testset_with_gdp+teckmengwong-time-features+transformed-target-for-train.joblib')

['/media/sf/easystore/kaggle_data/tabular_playgrounds/jan2022/datasets/encoded_train+testset_with_gdp+teckmengwong-time-features+transformed-target-for-train.joblib']

In [92]:
hybrid_train_df = encoded_all_df[:len(train_df)]
hybrid_valid_df = encoded_all_df[len(train_df): len(train_df)+len(valid_df)]

In [131]:
hybrid_test_df = encoded_all_df[len(train_df)+len(valid_df):]

In [93]:
hybrid_valid_df

Unnamed: 0,row_id,date,country,store,product,num_sold,gdp,month,season,wd4,...,easter50,easter51,easter52,easter53,easter54,easter55,easter56,easter57,easter58,target
19728,19728,2018-01-01,0,0,1,405.0,5.622500,1,1,False,...,False,False,False,False,False,False,False,False,False,3.910844
19729,19729,2018-01-01,0,0,0,621.0,5.622500,1,1,False,...,False,False,False,False,False,False,False,False,False,4.338288
19730,19730,2018-01-01,0,0,2,176.0,5.622500,1,1,False,...,False,False,False,False,False,False,False,False,False,3.077440
19731,19731,2018-01-01,0,1,1,714.0,5.622500,1,1,False,...,False,False,False,False,False,False,False,False,False,4.477839
19732,19732,2018-01-01,0,1,0,1043.0,5.622500,1,1,False,...,False,False,False,False,False,False,False,False,False,4.856813
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26293,26293,2018-12-31,2,0,0,823.0,6.321586,12,1,False,...,False,False,False,False,False,False,False,False,False,4.477861
26294,26294,2018-12-31,2,0,2,250.0,6.321586,12,1,False,...,False,False,False,False,False,False,False,False,False,3.286366
26295,26295,2018-12-31,2,1,1,1004.0,6.321586,12,1,False,...,False,False,False,False,False,False,False,False,False,4.676652
26296,26296,2018-12-31,2,1,0,1441.0,6.321586,12,1,False,...,False,False,False,False,False,False,False,False,False,5.037997


In [106]:
hybrid_train_df['neural_preds'] = neural_train_preds
hybrid_train_df['neural_residual'] = hybrid_train_df['num_sold'] - neural_train_preds

In [133]:
neural_test_preds.shape

(6570, 6)

In [134]:
neural_test_preds

Unnamed: 0,row_id,date,country,store,product,neuralprophet_forecast
0,26298,2019-01-01,Finland,KaggleMart,Kaggle Mug,399.074371
1,26299,2019-01-01,Finland,KaggleMart,Kaggle Hat,580.306213
2,26300,2019-01-01,Finland,KaggleMart,Kaggle Sticker,175.331146
3,26301,2019-01-01,Finland,KaggleRama,Kaggle Mug,680.846680
4,26302,2019-01-01,Finland,KaggleRama,Kaggle Hat,1002.552185
...,...,...,...,...,...,...
6565,32863,2019-12-31,Sweden,KaggleMart,Kaggle Hat,698.816772
6566,32864,2019-12-31,Sweden,KaggleMart,Kaggle Sticker,239.840546
6567,32865,2019-12-31,Sweden,KaggleRama,Kaggle Mug,751.112549
6568,32866,2019-12-31,Sweden,KaggleRama,Kaggle Hat,1036.387329


In [136]:
hybrid_test_df['neural_preds'] = neural_test_preds['neuralprophet_forecast']
hybrid_test_df

Unnamed: 0,row_id,date,country,store,product,num_sold,gdp,month,season,wd4,...,easter51,easter52,easter53,easter54,easter55,easter56,easter57,easter58,target,neural_preds
0,26298,2019-01-01,0,0,1,330.860660,5.597614,1,1,False,...,False,False,False,False,False,False,False,False,,399.074371
1,26299,2019-01-01,0,0,0,530.288008,5.597614,1,1,False,...,False,False,False,False,False,False,False,False,,580.306213
2,26300,2019-01-01,0,0,2,153.666157,5.597614,1,1,False,...,False,False,False,False,False,False,False,False,,175.331146
3,26301,2019-01-01,0,1,1,579.630455,5.597614,1,1,False,...,False,False,False,False,False,False,False,False,,680.846680
4,26302,2019-01-01,0,1,0,929.004612,5.597614,1,1,False,...,False,False,False,False,False,False,False,False,,1002.552185
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6565,32863,2019-12-31,2,0,0,615.873484,6.282042,12,1,False,...,False,False,False,False,False,False,False,False,,698.816772
6566,32864,2019-12-31,2,0,2,179.168451,6.282042,12,1,False,...,False,False,False,False,False,False,False,False,,239.840546
6567,32865,2019-12-31,2,1,1,675.905229,6.282042,12,1,False,...,False,False,False,False,False,False,False,False,,751.112549
6568,32866,2019-12-31,2,1,0,1078.850969,6.282042,12,1,False,...,False,False,False,False,False,False,False,False,,1036.387329


In [138]:
hybrid_test_residual_df = hybrid_test_df.drop(columns=['row_id', 'date', 'neural_preds', 'num_sold'])

In [141]:
hybrid_test_residual_df = hybrid_test_residual_df.drop(columns=['target'])
hybrid_test_residual_df

Unnamed: 0,country,store,product,gdp,month,season,wd4,wd56,sin1,cos1,...,easter47,easter50,easter51,easter52,easter53,easter54,easter55,easter56,easter57,easter58
0,0,0,1,5.597614,1,1,False,False,1.721336e-02,0.999852,...,False,False,False,False,False,False,False,False,False,False
1,0,0,0,5.597614,1,1,False,False,1.721336e-02,0.999852,...,False,False,False,False,False,False,False,False,False,False
2,0,0,2,5.597614,1,1,False,False,1.721336e-02,0.999852,...,False,False,False,False,False,False,False,False,False,False
3,0,1,1,5.597614,1,1,False,False,1.721336e-02,0.999852,...,False,False,False,False,False,False,False,False,False,False
4,0,1,0,5.597614,1,1,False,False,1.721336e-02,0.999852,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6565,2,0,0,6.282042,12,1,False,False,-2.449294e-16,1.000000,...,False,False,False,False,False,False,False,False,False,False
6566,2,0,2,6.282042,12,1,False,False,-2.449294e-16,1.000000,...,False,False,False,False,False,False,False,False,False,False
6567,2,1,1,6.282042,12,1,False,False,-2.449294e-16,1.000000,...,False,False,False,False,False,False,False,False,False,False
6568,2,1,0,6.282042,12,1,False,False,-2.449294e-16,1.000000,...,False,False,False,False,False,False,False,False,False,False


In [107]:
hybrid_train_df

Unnamed: 0,row_id,date,country,store,product,num_sold,gdp,month,season,wd4,...,easter52,easter53,easter54,easter55,easter56,easter57,easter58,target,neural_preds,neural_residual
0,0,2015-01-01,0,0,1,329.0,5.461456,1,1,False,...,False,False,False,False,False,False,False,3.738239,337.860291,-8.860291
1,1,2015-01-01,0,0,0,520.0,5.461456,1,1,False,...,False,False,False,False,False,False,False,4.196010,471.527588,48.472412
2,2,2015-01-01,0,0,2,146.0,5.461456,1,1,False,...,False,False,False,False,False,False,False,2.925788,146.239853,-0.239853
3,3,2015-01-01,0,1,1,572.0,5.461456,1,1,False,...,False,False,False,False,False,False,False,4.291321,571.667114,0.332886
4,4,2015-01-01,0,1,0,911.0,5.461456,1,1,False,...,False,False,False,False,False,False,False,4.756724,800.308228,110.691772
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19723,19723,2017-12-31,2,0,0,1037.0,6.295301,12,1,False,...,False,False,False,False,False,False,False,4.714042,561.964417,475.035583
19724,19724,2017-12-31,2,0,2,290.0,6.295301,12,1,False,...,False,False,False,False,False,False,False,3.439836,272.907135,17.092865
19725,19725,2017-12-31,2,1,1,1188.0,6.295301,12,1,False,...,False,False,False,False,False,False,False,4.849982,617.660156,570.339844
19726,19726,2017-12-31,2,1,0,1781.0,6.295301,12,1,False,...,False,False,False,False,False,False,False,5.254885,856.848389,924.151611


In [94]:
hybrid_valid_df['neural_preds'] = neural_valid_preds
hybrid_valid_df['neural_residual'] = hybrid_valid_df['num_sold'] - neural_valid_preds

In [95]:
hybrid_valid_df

Unnamed: 0,row_id,date,country,store,product,num_sold,gdp,month,season,wd4,...,easter52,easter53,easter54,easter55,easter56,easter57,easter58,target,neural_preds,neural_residual
19728,19728,2018-01-01,0,0,1,405.0,5.622500,1,1,False,...,False,False,False,False,False,False,False,3.910844,381.928375,23.071625
19729,19729,2018-01-01,0,0,0,621.0,5.622500,1,1,False,...,False,False,False,False,False,False,False,4.338288,556.387573,64.612427
19730,19730,2018-01-01,0,0,2,176.0,5.622500,1,1,False,...,False,False,False,False,False,False,False,3.077440,168.243637,7.756363
19731,19731,2018-01-01,0,1,1,714.0,5.622500,1,1,False,...,False,False,False,False,False,False,False,4.477839,653.829590,60.170410
19732,19732,2018-01-01,0,1,0,1043.0,5.622500,1,1,False,...,False,False,False,False,False,False,False,4.856813,956.518311,86.481689
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26293,26293,2018-12-31,2,0,0,823.0,6.321586,12,1,False,...,False,False,False,False,False,False,False,4.477861,688.424316,134.575684
26294,26294,2018-12-31,2,0,2,250.0,6.321586,12,1,False,...,False,False,False,False,False,False,False,3.286366,234.144318,15.855682
26295,26295,2018-12-31,2,1,1,1004.0,6.321586,12,1,False,...,False,False,False,False,False,False,False,4.676652,734.039551,269.960449
26296,26296,2018-12-31,2,1,0,1441.0,6.321586,12,1,False,...,False,False,False,False,False,False,False,5.037997,1014.238037,426.761963


In [108]:
dump(hybrid_train_df, datapath/'20220121_encoded-train_with_neural-preds+neural-residuals.joblib')

['/media/sf/easystore/kaggle_data/tabular_playgrounds/jan2022/datasets/20220121_encoded-train_with_neural-preds+neural-residuals.joblib']

In [96]:
dump(hybrid_valid_df, datapath/'20220121_encoded-valid_with_neural-preds+neural-residuals.joblib')

['/media/sf/easystore/kaggle_data/tabular_playgrounds/jan2022/datasets/20220121_encoded-valid_with_neural-preds+neural-residuals.joblib']

In [109]:
hybrid_train_residual_df = hybrid_train_df.drop(columns=['target', 'num_sold', 'neural_preds'])

In [97]:
hybrid_valid_residual_df = hybrid_valid_df.drop(columns=['target', 'num_sold', 'neural_preds'])

In [98]:
hybrid_valid_residual_df

Unnamed: 0,row_id,date,country,store,product,gdp,month,season,wd4,wd56,...,easter50,easter51,easter52,easter53,easter54,easter55,easter56,easter57,easter58,neural_residual
19728,19728,2018-01-01,0,0,1,5.622500,1,1,False,False,...,False,False,False,False,False,False,False,False,False,23.071625
19729,19729,2018-01-01,0,0,0,5.622500,1,1,False,False,...,False,False,False,False,False,False,False,False,False,64.612427
19730,19730,2018-01-01,0,0,2,5.622500,1,1,False,False,...,False,False,False,False,False,False,False,False,False,7.756363
19731,19731,2018-01-01,0,1,1,5.622500,1,1,False,False,...,False,False,False,False,False,False,False,False,False,60.170410
19732,19732,2018-01-01,0,1,0,5.622500,1,1,False,False,...,False,False,False,False,False,False,False,False,False,86.481689
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26293,26293,2018-12-31,2,0,0,6.321586,12,1,False,False,...,False,False,False,False,False,False,False,False,False,134.575684
26294,26294,2018-12-31,2,0,2,6.321586,12,1,False,False,...,False,False,False,False,False,False,False,False,False,15.855682
26295,26295,2018-12-31,2,1,1,6.321586,12,1,False,False,...,False,False,False,False,False,False,False,False,False,269.960449
26296,26296,2018-12-31,2,1,0,6.321586,12,1,False,False,...,False,False,False,False,False,False,False,False,False,426.761963


In [99]:
from xgboost import XGBRegressor

In [121]:
xgboost_params = {
    'tree_method': 'gpu_hist',
    'predictor': 'gpu_predictor',
    'eval_metric': ['mae', 'mape', 'rmse'],
    'learning_rate': .09,
    'max_depth': 0,
    'subsample': .15,
    'sampling_method': 'gradient_based',
    'seed': 42,
    'grow_policy': 'lossguide',
    'max_leaves': 255,
    'lambda': 100,
    'n_estimators': 1500
}


In [122]:
xgb_model = XGBRegressor(verbose=True, **xgboost_params)

In [110]:
hybrid_train_residual_df

Unnamed: 0,row_id,date,country,store,product,gdp,month,season,wd4,wd56,...,easter50,easter51,easter52,easter53,easter54,easter55,easter56,easter57,easter58,neural_residual
0,0,2015-01-01,0,0,1,5.461456,1,1,False,False,...,False,False,False,False,False,False,False,False,False,-8.860291
1,1,2015-01-01,0,0,0,5.461456,1,1,False,False,...,False,False,False,False,False,False,False,False,False,48.472412
2,2,2015-01-01,0,0,2,5.461456,1,1,False,False,...,False,False,False,False,False,False,False,False,False,-0.239853
3,3,2015-01-01,0,1,1,5.461456,1,1,False,False,...,False,False,False,False,False,False,False,False,False,0.332886
4,4,2015-01-01,0,1,0,5.461456,1,1,False,False,...,False,False,False,False,False,False,False,False,False,110.691772
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19723,19723,2017-12-31,2,0,0,6.295301,12,1,False,True,...,False,False,False,False,False,False,False,False,False,475.035583
19724,19724,2017-12-31,2,0,2,6.295301,12,1,False,True,...,False,False,False,False,False,False,False,False,False,17.092865
19725,19725,2017-12-31,2,1,1,6.295301,12,1,False,True,...,False,False,False,False,False,False,False,False,False,570.339844
19726,19726,2017-12-31,2,1,0,6.295301,12,1,False,True,...,False,False,False,False,False,False,False,False,False,924.151611


In [111]:
X_residual = hybrid_train_residual_df.drop(columns=['row_id', 'date', 'neural_residual'])
y_residual = hybrid_train_residual_df['neural_residual']

In [140]:
X_residual.shape

(19728, 243)

In [142]:
X_residual

Unnamed: 0,country,store,product,gdp,month,season,wd4,wd56,sin1,cos1,...,easter47,easter50,easter51,easter52,easter53,easter54,easter55,easter56,easter57,easter58
0,0,0,1,5.461456,1,1,False,False,1.721336e-02,0.999852,...,False,False,False,False,False,False,False,False,False,False
1,0,0,0,5.461456,1,1,False,False,1.721336e-02,0.999852,...,False,False,False,False,False,False,False,False,False,False
2,0,0,2,5.461456,1,1,False,False,1.721336e-02,0.999852,...,False,False,False,False,False,False,False,False,False,False
3,0,1,1,5.461456,1,1,False,False,1.721336e-02,0.999852,...,False,False,False,False,False,False,False,False,False,False
4,0,1,0,5.461456,1,1,False,False,1.721336e-02,0.999852,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19723,2,0,0,6.295301,12,1,False,True,-2.449294e-16,1.000000,...,False,False,False,False,False,False,False,False,False,False
19724,2,0,2,6.295301,12,1,False,True,-2.449294e-16,1.000000,...,False,False,False,False,False,False,False,False,False,False
19725,2,1,1,6.295301,12,1,False,True,-2.449294e-16,1.000000,...,False,False,False,False,False,False,False,False,False,False
19726,2,1,0,6.295301,12,1,False,True,-2.449294e-16,1.000000,...,False,False,False,False,False,False,False,False,False,False


In [112]:
X_valid_residual = hybrid_valid_residual_df.drop(columns=['row_id', 'date', 'neural_residual'])
y_valid_residual = hybrid_valid_residual_df['neural_residual']

In [123]:
xgb_model.fit(X_residual, y_residual)

Parameters: { "verbose" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
             eval_metric=['mae', 'mape', 'rmse'], gamma=0, gpu_id=0,
             grow_policy='lossguide', importance_type=None,
             interaction_constraints='', lambda=100, learning_rate=0.09,
             max_delta_step=0, max_depth=0, max_leaves=255, min_child_weight=1,
             missing=nan, monotone_constraints='()', n_estimators=1500,
             n_jobs=16, num_parallel_tree=1, predictor='gpu_predictor',
             random_state=42, reg_alpha=0, reg_lambda=100,
             sampling_method='gradient_based', scale_pos_weight=1, seed=42, ...)

In [124]:
xgb_valid_preds = xgb_model.predict(X_valid_residual)

In [127]:
SMAPE(y_pred=xgb_valid_preds, y_true=hybrid_valid_residual_df['neural_residual'])

28.95007133081958

In [128]:
from sklearn.metrics import mean_squared_error

In [130]:
math.sqrt(mean_squared_error(y_pred=xgb_valid_preds, y_true=hybrid_valid_residual_df['neural_residual']))

43.21746203067612

In [143]:
xgb_residual_preds = xgb_model.predict(hybrid_test_residual_df)

In [144]:
xgb_residual_preds

array([-2.5383317e-01,  2.9157864e+01, -7.4216886e+00, ...,
        2.6698178e+02,  5.1559656e+02,  6.0392628e+01], dtype=float32)

In [145]:
xgb_residual_preds.shape

(6570,)

In [125]:
residual_comparison = pd.DataFrame({
    'xgb': xgb_valid_preds,
    'actual': hybrid_valid_residual_df['neural_residual']
})

In [126]:
residual_comparison

Unnamed: 0,xgb,actual
19728,-0.253833,23.071625
19729,29.157864,64.612427
19730,-7.421689,7.756363
19731,-15.089988,60.170410
19732,88.849304,86.481689
...,...,...
26293,318.744751,134.575684
26294,19.579929,15.855682
26295,403.339813,269.960449
26296,668.112305,426.761963


### Submission for NeuralProphet baseline + XGBoost predicted residuals

In [146]:
sample_df = pd.read_csv(datapath/'sample_submission.csv')

In [148]:
neural_test_preds.columns

Index(['row_id', 'date', 'country', 'store', 'product',
       'neuralprophet_forecast'],
      dtype='object')

In [149]:
sample_df.loc[:, 'num_sold'] = neural_test_preds['neuralprophet_forecast'] + xgb_residual_preds

In [150]:
sample_df.head()

Unnamed: 0,row_id,num_sold
0,26298,398.820538
1,26299,609.464077
2,26300,167.909458
3,26301,665.756692
4,26302,1091.401489


In [151]:
sample_df.to_csv(subpath/f"20220121_NeuralProphet-baseline+XGBoost-residuals_preds.csv", index=False)
# sample_df.to_csv(subpath/f"{wandb_config['name']}_3level-X_orig+KMeans8+synth-GBM-stack_ensemble_preds.csv", index=False)

Linear models from Scikit-Learn seemingly require that datetime data be converted to numerics.

In [77]:
import datetime as dt

In [78]:
train_linear_df = train_df.copy()
valid_linear_df = valid_df.copy()
test_linear_df = test_df.copy()

train_linear_df['date'] = train_df['date'].map(dt.datetime.toordinal)
valid_linear_df['date'] = valid_df['date'].map(dt.datetime.toordinal)
test_linear_df['date'] = test_df['date'].map(dt.datetime.toordinal)

In [79]:
train_linear_df

Unnamed: 0,date,country,store,product,gdp,month,season,wd4,wd56,sin1,...,easter50,easter51,easter52,easter53,easter54,easter55,easter56,easter57,easter58,target
0,735599,0,0,1,5.461456,1,1,False,False,1.721336e-02,...,False,False,False,False,False,False,False,False,False,3.738239
1,735599,0,0,0,5.461456,1,1,False,False,1.721336e-02,...,False,False,False,False,False,False,False,False,False,4.196010
2,735599,0,0,2,5.461456,1,1,False,False,1.721336e-02,...,False,False,False,False,False,False,False,False,False,2.925788
3,735599,0,1,1,5.461456,1,1,False,False,1.721336e-02,...,False,False,False,False,False,False,False,False,False,4.291321
4,735599,0,1,0,5.461456,1,1,False,False,1.721336e-02,...,False,False,False,False,False,False,False,False,False,4.756724
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19723,736694,2,0,0,6.295301,12,1,False,True,-2.449294e-16,...,False,False,False,False,False,False,False,False,False,4.714042
19724,736694,2,0,2,6.295301,12,1,False,True,-2.449294e-16,...,False,False,False,False,False,False,False,False,False,3.439836
19725,736694,2,1,1,6.295301,12,1,False,True,-2.449294e-16,...,False,False,False,False,False,False,False,False,False,4.849982
19726,736694,2,1,0,6.295301,12,1,False,True,-2.449294e-16,...,False,False,False,False,False,False,False,False,False,5.254885


This is the pattern for reversing the transform:
```python
ridge_trainset_preds = np.exp(ridge.predict(X_all)) * X_all['gdp']**gdp_exponent # from AmbrosM
```

In [66]:
sample_df = pd.read_csv(datapath/'sample_submission.csv')

In [67]:
sample_df.loc[:, 'num_sold'] = encoded_test_df['num_sold']

In [68]:
sample_df.head()

Unnamed: 0,row_id,num_sold
0,26298,331
1,26299,530
2,26300,154
3,26301,580
4,26302,929


In [69]:
sample_df.to_csv(subpath/f"{os.environ['WANDB_NOTEBOOK_NAME']}_ridge_preds.csv", index=False)
# sample_df.to_csv(subpath/f"{wandb_config['name']}_3level-X_orig+KMeans8+synth-GBM-stack_ensemble_preds.csv", index=False)

#### Lasso

In [50]:
from sklearn.linear_model import Lasso

In [52]:
lasso_model = Lasso()

In [53]:
lasso_model.fit(X,y)

Lasso()

In [55]:
lasso_valid_preds = lasso_model.predict(X_valid)

In [57]:
SMAPE(y_pred=lasso_preds, y_true=y_valid)

14.719972647663342

Not so good there.

#### Huber

In [58]:
huber_model = HuberRegressor()

In [59]:
huber_model.fit(X,y)

HuberRegressor()

In [60]:
huber_valid_preds = huber_model.predict(X_valid)

In [61]:
SMAPE(y_pred=huber_valid_preds, y_true=y_valid)

14.808248729247591

That's not so great.

#### MLPRegressor

In [63]:
mlp_model = MLPRegressor(hidden_layer_sizes=(200,100), learning_rate_init=0.01, early_stopping=True, max_iter=300, random_state=42, learning_rate='adaptive')

In [64]:
mlp_model.fit(X,y)

MLPRegressor(early_stopping=True, hidden_layer_sizes=(200, 100),
             learning_rate='adaptive', learning_rate_init=0.01, max_iter=300,
             random_state=42)

In [65]:
mlp_preds = mlp_model.predict(X_valid)

In [66]:
SMAPE(y_pred=mlp_preds, y_true=y_valid)

15.639894056665263

Not great, that. Let's try at least one more set of hyperparams.

In [67]:
mlp_model = MLPRegressor(hidden_layer_sizes=(200,100,50), 
                         learning_rate_init=0.01, 
                         early_stopping=True, 
                         max_iter=300, 
                         random_state=42, 
                         learning_rate='adaptive')

In [68]:
mlp_model.fit(X,y)

MLPRegressor(early_stopping=True, hidden_layer_sizes=(200, 100, 50),
             learning_rate='adaptive', learning_rate_init=0.01, max_iter=300,
             random_state=42)

In [69]:
mlp_preds = mlp_model.predict(X_valid)

In [70]:
SMAPE(y_pred=mlp_preds, y_true=y_valid)

14.763122872881882

Marginally better. Let's try one more layer.

In [71]:
mlp_model = MLPRegressor(hidden_layer_sizes=(200,100,50,25), 
                         learning_rate_init=0.01, 
                         early_stopping=True, 
                         max_iter=300, 
                         random_state=42, 
                         learning_rate='adaptive')

In [72]:
mlp_model.fit(X,y)

MLPRegressor(early_stopping=True, hidden_layer_sizes=(200, 100, 50, 25),
             learning_rate='adaptive', learning_rate_init=0.01, max_iter=300,
             random_state=42)

In [73]:
mlp_preds = mlp_model.predict(X_valid)

In [74]:
SMAPE(y_pred=mlp_preds, y_true=y_valid)

14.862419246640776

A bit worse there.

#### Linear Regression

In [75]:
from sklearn.linear_model import LinearRegression

In [76]:
linreg_model = LinearRegression(fit_intercept=False)

In [77]:
linreg_model.fit(X,y)

LinearRegression(fit_intercept=False)

In [78]:
linreg_preds = linreg_model.predict(X_valid)

In [79]:
SMAPE(y_pred=linreg_preds, y_true=y_valid)

6.752939183524428

That's better

### Pure Time Series (Prophet, NeuralProphet)

For this, just use the trainers in other files

### GBMs

Now, I'm going to try training the GBMs, stripping out the years. I won't -- yet -- make them residual-only, however. I'll double back to the `tv_df`, pre-validation splitting.

In [74]:
tv_df_gbm = tv_df.drop(columns=['date'])

In [76]:
tv_df_gbm.columns

Index(['country', 'store', 'product', 'gdp', 'month', 'season', 'wd4', 'wd56',
       'sin1', 'cos1',
       ...
       'easter50', 'easter51', 'easter52', 'easter53', 'easter54', 'easter55',
       'easter56', 'easter57', 'easter58', 'target'],
      dtype='object', length=244)

Going to bring over my old cross-trainer function, give it a whirl.

In [47]:
def cross_validate_model(arch:str, X=X, y=y, X_test=X_test, model_params:dict={}, training_params=training_params, dataset_params=dataset_params,
                         folds=list(range(folds)), exmodel_config=exmodel_config, wandb_config=wandb_config,  telegram=True, random_state=42, 
                         wandb_tracked=True, encode_cats=False):
    """
    Function to handle model training process in the context of cross-validation -- via hold-out or via k-fold.
    If exmodel_config['cross_val_strategy'] == None, then any kfolds= input is ignored; otherwise, the number specified is used.
    
    :param kfolds: int specifying number of k-folds to use in cross-validation
    :param exmodel_config: dict containing general config including for cross-validation -- `kfold=1` implies hold-out
    """
    # if exmodel_config['kfolds'] == 1: # holdout case
    #     print("Proceeding with holdout")
    #     X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, 
    #                                                           random_state=SEED)                 
    # else: # k-fold cross validation case
    #     # prepare for k-fold cross-validation; random-state here is notebook-wide, not per-model
    #     # shuffle on the initial sets, but not subsequently -- performing the same operation twice means a very different dataset
    #     if shuffle_kfolds:
    #         kfold = exmodel_config['cross_val_strategy'](n_splits=exmodel_config['kfolds'], shuffle=True, random_state=SEED)
    #     else:
    #         kfold = exmodel_config['cross_val_strategy'](n_splits=exmodel_config['kfolds'], shuffle=False)
    
    kfold = training_params['cross_val_strategy']
    zz
    if wandb_tracked:
        exmodel_config['arch'] = arch
        exmodel_config[f'{arch}_params'] = str(model_params)
        wandb.init(
            project="202112_Kaggle_tabular_playground",
            save_code=True,
            tags=wandb_config['tags'],
            name=wandb_config['name'],
            notes=wandb_config['notes'],
            config=exmodel_config
    )   
    
    # initialize lists for out-of-fold preds and ground truth
    oof_preds, oof_y = [], []
    
    # initialize a numpy.ndarray containing the fold-model's preds for test set
    
    test_preds = np.zeros((X_test.shape[0]))
    # test_probs = np.zeros((X_test.shape[0]))
    # preprocessing
    # if using a GBM, simply use the RobustScaler
        # scaler = RobustScaler()
        # X = scaler.fit_transform(X)
        # X_test = scaler.transform(X_test)
    
    for fold, (train_ids, valid_ids) in enumerate(kfold.split(X,y)):
        if fold not in folds: # skip folds that are already trained, i.e. that haven't been specified
            continue
        else:
            print(f"FOLD {fold}")
            print("---------------------------------------------------")
            y_train, y_valid = y[train_ids], y[valid_ids] # y will be an np.ndarray already; handling will be same regardless of model
            if isinstance(X, np.ndarray):
                X_train, X_valid = X[train_ids], X[valid_ids]
            else:
                X_train, X_valid = X.iloc[train_ids,:], X.iloc[valid_ids,:] # bc need pandas.DataFrames for ce
                
                # scaling
                # category_encoding
                # if encode_cats:
                #     encoder = ce.WOEEncoder(cols=categoricals)
                #     encoder.fit(X_train,y_train)
                #     X_train = encoder.transform(X_train)
                #     X_valid = encoder.transform(X_valid)
                # # exmodel_config['feature_count'] = len(X.columns)
                #     wandb.log({
                #         'feature_count': X_train.shape[1],
                #         'instance_count': X_train.shape[0],
                #         'encoder': str(encoder)
                #     })
        
        # define models
        if arch == 'xgboost':
            model = XGBClassifier(
                booster='gbtree',
                tree_method='gpu_hist',
                random_state=random_state,
                n_jobs=-1, 
                verbosity=1, 
                objective='binary:logistic',
                **model_params)
            if wandb_tracked:
                model.fit(X_train, y_train, callbacks=[wandb.xgboost.wandb_callback()])
            else:
                model.fit(X_train, y_train)
            
            y_valid_preds = model.predict(X_valid)
            # y_valid_probs = model.predict_proba(X_valid)
            
            # add the fold-model's OOF preds and ground truths to the out-of-loop lists
            oof_preds.extend(y_valid_preds)
            # oof_probs.extend(y_valid_probs)
            oof_y.extend(y_valid)
            
            # add the fold's predictions to the model's test-set predictions (will divide later)
            test_preds += model.predict(X_test)
            # test_probs += model.predict_proba(X_test)


        elif arch == 'lightgbm':
            # try:
            model = LGBMClassifier(
                objective='binary',
                random_state=random_state,
#                     device_type='cpu',
#                     n_jobs=-1,
#                 eval_metric='auc',
                device_type='gpu',
                max_bin=63, # 15 might be even better for GPU perf, but depends on dataset -- see https://lightgbm.readthedocs.io/en/latest/GPU-Performance.html
                gpu_use_dp=False, # forces use of single precision rather than double for better perf, esp on consumer Nvidia chips
                **model_params)

            if wandb_tracked:
                model.fit(X_train, y_train, callbacks=[wandb.lightgbm.wandb_callback()],)
            else:
                model.fit(X_train, y_train)
#             except LightGBMError:
#                 model = LGBMClassifier(
#                     objective='binary',
#                     random_state=random_state,
#                     device_type='cpu',
#                     n_jobs=-1,
#     #                 eval_metric='auc',
#     #                 device_type='gpu',
#     #                 max_bin=63, # 15 might be even better for GPU perf, but depends on dataset -- see https://lightgbm.readthedocs.io/en/latest/GPU-Performance.html
#     #                 gpu_use_dp=False, # forces use of single precision rather than double for better perf, esp on consumer Nvidia chips
#                     **params)
                
#                 if wandb_tracked:
#                     model.fit(X_train, y_train, callbacks=[wandb.lightgbm.wandb_callback()],)
#                 else:
#                     model.fit(X_train, y_train)
            y_valid_preds = model.predict(X_valid)
            # y_valid_probs = model.predict_proba(X_valid)
            
            # add the fold-model's OOF preds and ground truths to the out-of-loop lists
            oof_preds.extend(y_valid_preds)
            # oof_probs.extend(y_valid_probs)
            oof_y.extend(y_valid)
            
            # add the fold's predictions to the model's test-set predictions (will divide later)
            test_preds += model.predict(X_test)
            # test_probs += model.predict_proba(X_test)[:,1]

            
        elif arch == 'catboost':
            model = CatBoostClassifier(
                task_type='GPU',
                silent=True,
                random_state=random_state,
                **model_params) 
        
            model.fit(X_train, y_train)
            
            y_valid_preds = model.predict(X_valid)
            # y_valid_probs = model.predict_proba(X_valid)[:,1] # this would only take one of 7 cols
            
            # add the fold-model's OOF preds and ground truths to the out-of-loop lists
            oof_preds.extend(y_valid_preds)
            # oof_probs.extend(y_valid_probs)
            oof_y.extend(y_valid)
            
            # add the fold's predictions to the model's test-set predictions (will divide later)
            test_preds += model.predict(X_test).flatten()
            # test_probs += model.predict_proba(X_test)[:,1]
            
#         valid_loss = log_loss(y_valid, y_pred)
        # give the valid AUC score, for edification

        fold_accuracy = accuracy_score(y_true=y_valid, y_pred=y_valid_preds) # or should be preds?
        # fold_confusion = confusion_matrix(y_true=y_valid, y_pred=y_valid_preds)# , labels=list(range(7)))
        # fold_log_loss = log_loss(y_pred=y_valid_preds, y_true=y_valid,) #labels=list(range(7)))
        # fold_roc_auc = roc_auc_score(y_true=y_valid, y_score=y_valid_probs)
        # fold_f1_score = f1_score(
        # fold_fbeta_score = fbeta_score(
        
        if wandb_tracked:
            wandb.log({f'fold{fold}_accuracy': fold_accuracy,
                       # f'fold{fold}_confusion': fold_confusion,
                       # f'fold{fold}_log_loss': fold_log_loss,
                       # f'fold{fold}_roc_auc': fold_roc_auc,
                      })
        fold_human_results = f"{os.environ['WANDB_NOTEBOOK_NAME']}\nMetrics for fold {fold} are: \nAccuracy: {fold_accuracy}"
        print(fold_human_results)
        if telegram:
            send_tg_message(text=f"{arch} model's fold {fold} complete.\n"+fold_human_results)
        # dump(model, Path(runpath/f"{arch}_fold{fold}_rs{random_state}_model.joblib"))

    model_accuracy = accuracy_score(y_true=oof_y, y_pred=oof_preds) 
    # model_confusion = confusion_matrix(y_true=oof_y, y_pred=oof_preds, labels=list(range(7)))
    # model_log_loss = log_loss(y_pred=oof_preds, y_true=oof_y, labels=list(range(7)))
    # model_valid_auc = roc_auc_score(oof_y, oof_preds)
    model_human_results = f"{os.environ['WANDB_NOTEBOOK_NAME']}\nMetrics for model {arch} are: \nAccuracy: {model_accuracy}"
    print(model_human_results)
    if telegram:
        send_tg_message(text=f"{arch} model run complete.\n"+model_human_results)
    if wandb_tracked:
        wandb.log({f'model_accuracy': fold_accuracy,
                   # f'model_confusion': fold_confusion,
                   # f'model_log_loss': fold_log_loss,
                   # f'model_roc_auc': fold_roc_auc,
                   'model_params': str(model.get_params()),
                   'model_seed': random_state,
                  })
        wandb.finish()
    
    # finalize test preds
    # test_probs /= exmodel_config['kfolds']
    # test_preds /= exmodel_config['kfolds']
    
    
    # save OOF preds and test-set preds
#     if 'widedeep' in arch:
#         dump(oof_preds, Path(predpath/f"{wandb_config['name']}_{arch}_{exmodel_config['kfolds']}folds_{n_epochs}epochs-per-fold_rs{random_state}_oof_preds.joblib"))
#         dump(test_preds, Path(predpath/f"{wandb_config['name']}_{arch}_{exmodel_config['kfolds']}folds_{n_epochs}epochs-per-fold_rs{random_state}_test_preds.joblib"))
    
#     else:
#         dump(oof_preds, Path(predpath/f"{wandb_config['name']}_{arch}_{exmodel_config['kfolds']}folds_rs{random_state}_oof_preds.joblib"))
#         dump(test_preds, Path(predpath/f"{wandb_config['name']}_{arch}_{exmodel_config['kfolds']}folds_rs{random_state}_test_preds.joblib"))
    
    # if not (datapath/f"{exmodel_config['kfolds']}folds_rs{SEED}_oof_y.joblib").is_file():
    #     dump(oof_y, predpath/f"{exmodel_config['kfolds']}folds_rs{SEED}_oof_y.joblib")
    
#     if wandb_tracked:
# #         if 'widedeep' in arch:
#         wandb.log({'model_valid_auc': model_valid_auc,
# #                    'oof_preds': oof_preds,
# #                    'test_preds': test_preds,
#                    'model_params': str(model.parameters()) if 'widedeep' in arch else str(model.get_params()), 
#         #                    'model_params': str(model.get_params()),
#         })
# #         wandb.log({'model_valid_auc': model_valid_auc,
# #                    'oof_preds': oof_preds,
# #                    'test_preds': test_preds,
# # #                    'model_params': str(model.get_params()),
# #                   })
#         wandb.finish()
    return oof_preds, test_preds#, model_confusion
        

  and should_run_async(code)


# Old stuff

## Training Params

In [16]:
# training_params = {
#     'general_random_state': SEED,
# }

# folds = 5
# training_params['cross_val_strategy'] = StratifiedKFold(n_splits=folds, shuffle=True, random_state=SEED)

## Model Params

## Metadata

In [None]:
# # baseline -- alter as needed later
# exmodel_config = {
#     'general_random_state': SEED,
# #     'feature_generation': ['NaN_counts', 'SummaryStats', 'NaN_OneHots'],
#     **dataset_params,
# #     **training_params,
# #     **model_params # perhaps do later
# }

## WandB Config

In [None]:
# # wandb config:
# wandb_config = {
#     'name': f"{os.environ['WANDB_NOTEBOOK_NAME'][:-6]}_{datetime.now().strftime('%H%M%S')}", # just removes the .ipynb extension, leaving the notebook filename's stem
#     'tags': ['EDA'],
#     'notes': "EDA"
# }