# Hybrid
Going to attempt a hybrid model after the example of [this Teck Meng Wong notebook](https://www.kaggle.com/teckmengwong/tps2201-hybrid-time-series/notebook).

- 20220122: Going to try to form ensembles, with more code architecture. Forecasting models will include Prophet, NeuralProphet, Ridge, and Linear (with more to come -- e.g. perhaps transformers and other DNNs); residual models will include GBMs, perhaps some tabular DNNs too.

In [1]:
# notebook configuration
# if '/sf/' in pwd:
#     COLAB, SAGE = False, False
# elif 'google.colab' in str(get_ipython()):
#     COLAB, SAGE = True, False # do colab-specific installs later
# else:
#     COLAB, SAGE = False, True
    
CONTEXT = 'local' # or 'colab', 'sage', 'kaggle'
USE_GPU = True 
%config Completer.use_jedi = False

## Imports

In [2]:
# basic imports
from pathlib import Path
import os
import math
from datetime import datetime
import random

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import warnings
warnings.filterwarnings("ignore")

import requests # for telegram notifications
from tqdm.notebook import tqdm

from joblib import dump, load

import datetime as dt

Now, non-stdlib imports

In [3]:
# model selection
from sklearn.model_selection import train_test_split, StratifiedKFold, KFold

# metrics
from sklearn.metrics import accuracy_score, mean_absolute_error, mean_squared_error

# normalization
# from sklearn.preprocessing import RobustScaler, StandardScaler, MinMaxScaler, MaxAbsScaler, RobustScaler, QuantileTransformer
# from gauss_rank_scaler import GaussRankScaler

# feature generation
# import category_encoders as ce

# models -- will be imported JIT
# from catboost import CatBoostClassifier
# from xgboost import XGBClassifier
# from lightgbm import LGBMClassifier
# from sklearn.ensemble import StackingClassifier, RandomForestClassifier
import torch
from torch.optim import Adam, AdamW, Adagrad, SGD, RMSprop, LBFGS
from torch.optim.lr_scheduler import ReduceLROnPlateau, CosineAnnealingWarmRestarts, CyclicLR, OneCycleLR, StepLR, CosineAnnealingLR
# from pytorch_widedeep import Trainer
# from pytorch_widedeep.preprocessing import WidePreprocessor, TabPreprocessor
# from pytorch_widedeep.models import Wide, TabMlp, WideDeep, SAINT#, TabTransformer, TabNet, TabFastFormer, TabResnet
# from pytorch_widedeep.metrics import Accuracy
# from pytorch_widedeep.callbacks import EarlyStopping, LRHistory, ModelCheckpoint

# feature reduction
# from sklearn.decomposition import PCA
# from umap import UMAP

# clustering
# from sklearn.cluster import DBSCAN, KMeans
# import hdbscan

# feature selection
# from sklearn.feature_selection import SelectKBest, f_regression, mutual_info_regression
# import featuretools as ft
# from BorutaShap import BorutaShap
# from boruta import BorutaPy

# tracking 
import wandb
from wandb.xgboost import wandb_callback
from wandb.lightgbm import wandb_callback
os.environ['WANDB_NOTEBOOK_NAME'] = f"nb_{datetime.now().strftime('%Y%m%d')}.ipynb"

In [4]:
# # time series
# import tsfresh

# import darts
# from darts import TimeSeries
# from darts.models import ExponentialSmoothing, AutoARIMA, ARIMA, Prophet, RandomForest, RegressionEnsembleModel, RegressionModel, TFTModel, TCNModel, TransformerModel, NBEATSModel
import holidays
import dateutil.easter as easter
from prophet import Prophet
from neuralprophet import NeuralProphet

## Routing

Now, datapath setup

In [5]:
# Pick the project root for the current execution context. Only the local root is
# filled in; the other contexts still carry a TODO placeholder (the current directory).
if CONTEXT == 'colab':
    # mount Google Drive
    from google.colab import drive
    drive.mount('/content/drive')
    
    # handling datapath
    # datapath = Path('/content/drive/MyDrive/kaggle/tabular_playgrounds/dec2021/')
    root = Path('') # TODO

elif CONTEXT == 'sage':
    root = Path('') # TODO
    
elif CONTEXT == 'kaggle':
    root = Path('') # TODO
    
else: # if on local machine
    root = Path('/media/sf/easystore/kaggle_data/tabular_playgrounds/jan2022/')

# Derive the working directories from `root` for EVERY context. Previously these names
# were only bound in the local branch, so any other context failed with a NameError as
# soon as `datapath` was first used.
datapath = root/'datasets'
# edapath = root/'EDA'
# modelpath = Path('/media/sf/easystore/kaggle_data/tabular_playgrounds/oct2021/models/')
predpath = root/'preds'
subpath = root/'submissions'
studypath = root/'studies'

# Create the directories if they are missing (the parent `root` must already exist).
for pth in [datapath, predpath, subpath, studypath]:
    pth.mkdir(exist_ok=True)

## Helpers

In [6]:
SEED = 42

# Function to seed everything but the models
def seed_everything(seed, pytorch=True, reproducible=True):
    """
    Seed the global random number generators used by the notebook (not the models themselves).

    Parameters
    ----------
    seed : int
        Value handed to `random`, NumPy, `PYTHONHASHSEED` and (optionally) PyTorch.
    pytorch : bool
        When True, also seed the torch CPU generator and, if CUDA is available, every GPU.
    reproducible : bool
        When True (and cuDNN is available), force deterministic cuDNN kernels and
        disable the cuDNN benchmark auto-tuner.
    """
    random.seed(seed)
    np.random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)

    if not pytorch:
        return

    torch.manual_seed(seed)  # CPU generator
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)  # every visible GPU

    if reproducible and torch.backends.cudnn.is_available():
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False

seed_everything(seed=SEED)

In [7]:
def reduce_memory_usage(df, verbose=True):
    """
    Reduce memory usage by downcasting numeric columns of a Pandas DataFrame when possible.

    Each signed-integer / float column is cast to the narrowest dtype of the same kind whose
    representable range contains the column's min and max (bounds inclusive, so a column
    holding exactly 127 still fits in int8). Non-numeric columns are left untouched.

    NOTE: the DataFrame is modified IN PLACE and also returned. Downcasting floats to
    float16/float32 only checks the value *range*, not precision, so values may be rounded.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose numeric columns should be downcast.
    verbose : bool
        When True, print the final memory footprint and the percentage saved.

    Returns
    -------
    pandas.DataFrame
        The same `df` object, with downcast columns.

    h/t to Bryan Arnold (https://www.kaggle.com/puremath86/label-correction-experiments-tps-nov-21)
    """

    numerics = ["int8", "int16", "int32", "int64", "float16", "float32", "float64"]
    # candidate dtypes, narrowest first
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024 ** 2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if str(col_type)[:3] == "int":
            # an empty column yields NaN bounds; every comparison is False and the dtype is kept
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            # fall back to float64 when nothing narrower can hold the range (e.g. inf, all-NaN)
            target = np.float64
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    target = candidate
                    break
            df[col] = df[col].astype(target)

    end_mem = df.memory_usage().sum() / 1024 ** 2
    if verbose:
        # guard the percentage against a zero-byte starting footprint
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print(
            "Mem. usage decreased to {:.2f} Mb ({:.1f}% reduction)".format(
                end_mem, reduction
            )
        )
    return df

In [8]:
tg_api_token = 'your_api_token' # for Galileo (jupyter_watcher_bot) on Telegram
tg_chat_id = 'your_chat_id'

import requests

def send_tg_message(text='Cell execution completed.', timeout=10):
    """
    Send `text` to the configured Telegram chat through the Bot API `sendMessage` endpoint.

    Uses the module-level `tg_api_token` and `tg_chat_id`. The call is best-effort: it is
    bounded by `timeout` seconds so a stalled connection cannot hang the notebook, and a
    network failure is reported on stdout instead of raising, so a failed notification
    cannot abort the surrounding cell.

    Parameters
    ----------
    text : str
        Message body.
    timeout : float
        Seconds to wait for the HTTP request before giving up.

    h/t Ivan Dembicki Jr. for the base version 
    (https://medium.com/@ivan.dembicki.jr/notifications-in-jupyter-notebook-with-telegram-f2e892c55173)
    """
    url = 'https://api.telegram.org/' +  'bot{}/sendMessage'.format(tg_api_token)
    try:
        requests.post(url, params=dict(chat_id=tg_chat_id, text=text), timeout=timeout)
    except requests.RequestException as err:
        # Only the exception type is printed: the exception message can embed the request
        # URL, which contains the bot token.
        print('Telegram notification failed: {}'.format(type(err).__name__))

In [9]:
def SMAPE(y_true, y_pred):
    '''
    Symmetric mean absolute percentage error, on a 0-200 scale:

        mean( 200 * |y_true - y_pred| / (|y_true| + |y_pred|) )

    Terms whose denominator is zero (both values are 0) contribute 0.

    Inputs may be scalars, lists, NumPy arrays or pandas objects; they are converted to
    float arrays and compared positionally (pandas index labels are ignored). NaNs in
    either input propagate to the result.

    The denominator uses |y_true| (not the raw y_true), so a negative actual value can no
    longer cancel against the prediction and be scored as a perfect match.

    h/t Jean-François Puget (@CPMP) -- see https://www.kaggle.com/c/web-traffic-time-series-forecasting/discussion/36414
    '''
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    denominator = (np.abs(y_true) + np.abs(y_pred)) / 200.0
    abs_error = np.abs(y_true - y_pred)

    # divide only where the denominator is non-zero; the remaining entries stay at 0.0
    diff = np.divide(abs_error, denominator,
                     out=np.zeros_like(denominator), where=denominator != 0)
    return np.mean(diff)

In [10]:
# https://www.kaggle.com/c/ventilator-pressure-prediction/discussion/282735
def better_than_median(inputs, axis, spread_lim=0.45):
    """Compute the mean of the predictions if there are no outliers,
    or the median if there are outliers.

    For every slice along `axis`, the spread (max - min) is compared with `spread_lim`:
    slices with a smaller spread are averaged, all others use the median. A short summary
    of how many slices took each route is printed.

    Parameters:
        inputs = ndarray of shape (n_samples, n_folds)
        axis = axis holding the values to blend (1 for the shape above)
        spread_lim = spread below which a slice is treated as outlier-free

    Returns: ndarray with `axis` reduced away."""
    inputs = np.asarray(inputs)
    spread = inputs.max(axis=axis) - inputs.min(axis=axis)
    use_mean = spread < spread_lim
    print(f"Inliers:  {use_mean.sum():7} -> compute mean")
    print(f"Outliers: {(~use_mean).sum():7} -> compute median")
    # count the blended slices themselves, so the total is right for any `axis`
    print(f"Total:    {spread.size:7}")
    return np.where(use_mean,
                    np.mean(inputs, axis=axis),
                    np.median(inputs, axis=axis))

In [11]:
# https://www.kaggle.com/teckmengwong/tps2201-hybrid-time-series
def plot_periodogram(ts, detrend='linear', ax=None):
    """
    Plot the periodogram (power spectrum) of a daily time series on a log-frequency axis
    whose ticks are labelled in cycles per year (annual, quarterly, weekly, ...).

    Parameters
    ----------
    ts : array-like
        The series values, assumed to be sampled once per day.
    detrend : str
        Passed through to `scipy.signal.periodogram`.
    ax : matplotlib Axes, optional
        Axes to draw on; a new figure/axes pair is created when omitted.

    Returns
    -------
    The Axes that was drawn on.
    """
    from scipy.signal import periodogram
    # Sampling frequency: daily observations per (mean Gregorian) year. This is the value
    # the former `pd.Timedelta("1Y") / pd.Timedelta("1D")` evaluated to; the "Y" unit is no
    # longer accepted by pandas >= 2.0, so the constant is written out.
    fs = 365.2425
    frequencies, spectrum = periodogram(
        ts,
        fs=fs,
        detrend=detrend,
        window="boxcar",
        scaling='spectrum',
    )
    if ax is None:
        _, ax = plt.subplots()
    ax.step(frequencies, spectrum, color="purple")
    ax.set_xscale("log")
    ax.set_xticks([1, 2, 4, 6, 12, 26, 52, 104])
    ax.set_xticklabels(
        [
            "Annual (1)",
            "Semiannual (2)",
            "Quarterly (4)",
            "Bimonthly (6)",
            "Monthly (12)",
            "Biweekly (26)",
            "Weekly (52)",
            "Semiweekly (104)",
        ],
        rotation=30,
    )
    ax.ticklabel_format(axis="y", style="sci", scilimits=(0, 0))
    ax.set_ylabel("Variance")
    ax.set_title("Periodogram")
    return ax

In [12]:
# https://www.kaggle.com/teckmengwong/tps2201-hybrid-time-series
def fourier_features(index, freq, order):
    """
    Build sine/cosine Fourier terms for a regularly spaced index.

    Time is measured as the position 0..len(index)-1. For each harmonic i in 1..order the
    columns `sin_{freq}_{i}` and `cos_{freq}_{i}` hold sin/cos of `i * 2*pi * t / freq`.

    Returns a DataFrame indexed by `index`, with columns ordered sin, cos per harmonic.
    """
    steps = np.arange(len(index), dtype=np.float32)
    base_angle = 2 * np.pi * (1 / freq) * steps

    columns = {}
    for harmonic in range(1, order + 1):
        angle = harmonic * base_angle
        columns[f"sin_{freq}_{harmonic}"] = np.sin(angle)
        columns[f"cos_{freq}_{harmonic}"] = np.cos(angle)

    return pd.DataFrame(columns, index=index)

## Dataset Setup

### Original Data Loading

In [13]:
# dataset_params will initially include either trivial class instances or loaded, precomputed artifacts
dataset_params = {
    'train_source': str(datapath/'train.csv'),
    'target_source': str(datapath/'train.csv'),
    'test_source': str(datapath/'test.csv'),
    # 'scaler': str(RobustScaler()),
    # 'pca': str(load(datapath/'pca_mle-RobustScaled_orig_trainset.joblib')),
    # 'umap': str(load(datapath/'umap_reducer-20211107-n_comp10-n_neighbors15-rs42-pca_mle-RobustScaled_orig_trainset.joblib')),
}   

# referring back to the already-entered attributes, specify how the pipeline was sequenced
# dataset_params['preprocessing_pipeline'] = str([dataset_params['scaler'], dataset_params['pca'], dataset_params['umap']]) # ACTUALLY this is unwieldy
# dataset_params['preprocessing_pipeline'] = '[scaler, pca, umap]' # more fragile, but also more readable

# now, load the datasets and generate more metadata from them
train_df = pd.read_csv(datapath/'train.csv')
test_df = pd.read_csv(datapath/'test.csv')
orig_train_df = train_df.copy()
orig_test_df = test_df.copy()

Since the dates are natively `Object` dtype (i.e. strings), we have to convert them:

In [14]:
# https://www.kaggle.com/ambrosm/tpsjan22-03-linear-model
for df in [train_df, test_df]:
    df['date'] = pd.to_datetime(df.date)

# for convenience later
countries = ['Sweden', 'Finland', 'Norway']
stores = ['KaggleMart', 'KaggleRama']
products = ['Kaggle Mug', 'Kaggle Hat', 'Kaggle Sticker']

Provisionally, I'm going to concatenate together the `train_df` and `test_df` for preprocessing, to avoid having to constantly apply transforms twice (since I don't anticipate doing any transforms that might allow data leakage to occur).

In [15]:
# Stack the training rows on top of the test rows. `ignore_index` is not used, so each
# frame keeps its own index labels and labels repeat in `all_df`; the later train/test
# split is therefore done by position (row count), not by label.
all_df = pd.concat([train_df, test_df], axis=0)
# all_df.columns
# sanity check: no rows lost or duplicated by the concatenation
print(len(all_df) == len(train_df) + len(test_df))
# the separate frames are no longer needed (untouched copies live in orig_train_df / orig_test_df)
del train_df, test_df

True


### GDP Data
Here's data from Carl McBride Ellis ([notebook](https://www.kaggle.com/carlmcbrideellis/gdp-of-finland-norway-and-sweden-2015-2019) and [dataset](https://www.kaggle.com/carlmcbrideellis/gdp-20152019-finland-norway-and-sweden)) for doing GDP comparisons. It's frequently used in other entries. I've created a function to add it on.

In [16]:
def add_gdp_data(df):
    """
    Attach a `gdp` column holding log1p of the GDP for each row's country and year.

    The GDP table is read from `datapath` and indexed by year; its columns are looked up
    as `GDP_<country>`. `df` needs a datetime `date` column and a `country` column.
    The frame is modified in place and also returned.
    """
    gdp_table = pd.read_csv(
        datapath/'GDP_data_2015_to_2019_Finland_Norway_Sweden.csv'
    ).set_index('year')

    def get_gdp(row):
        # one lookup per row: (year of the row's date, that country's GDP column)
        return gdp_table.loc[row.date.year, 'GDP_' + row.country]

    raw_gdp = df.apply(get_gdp, axis=1)
    df['gdp'] = np.log1p(raw_gdp)
    return df

I'll also define here (but perhaps move later) the GDP exponent, which will be used to transform the targets before inference: dividing `num_sold` by $GDP^{1.212}$ and then taking the logarithm (after @ambrosm).

In [17]:
gdp_exponent = 1.2121103201489674 # see https://www.kaggle.com/ambrosm/tpsjan22-03-linear-model for an explanation

In [18]:
all_df = add_gdp_data(all_df)

In [19]:
all_df

Unnamed: 0,row_id,date,country,store,product,num_sold,gdp
0,0,2015-01-01,Finland,KaggleMart,Kaggle Mug,329.0,5.461456
1,1,2015-01-01,Finland,KaggleMart,Kaggle Hat,520.0,5.461456
2,2,2015-01-01,Finland,KaggleMart,Kaggle Sticker,146.0,5.461456
3,3,2015-01-01,Finland,KaggleRama,Kaggle Mug,572.0,5.461456
4,4,2015-01-01,Finland,KaggleRama,Kaggle Hat,911.0,5.461456
...,...,...,...,...,...,...,...
6565,32863,2019-12-31,Sweden,KaggleMart,Kaggle Hat,,6.282042
6566,32864,2019-12-31,Sweden,KaggleMart,Kaggle Sticker,,6.282042
6567,32865,2019-12-31,Sweden,KaggleRama,Kaggle Mug,,6.282042
6568,32866,2019-12-31,Sweden,KaggleRama,Kaggle Hat,,6.282042


## Feature Engineering

### Time Features

The goal of this function is to create features that will capture seasonalities -- but **not** trends. The trends will (hopefully) be captured by the deployment of linear forecasting algorithms on raw time series data (consisting exclusively of dates and targets); we want to have seasonalities that the residual models can learn, however -- holidays, weekly patterns, climatic season patterns, etc.

The cell below will generate the `holidays` library's entries for the three countries. I may want to follow the template of @teckmengwong's code below, and add more holidays -- then, do some feature importance checking, and perhaps whittle down the features accordingly.

In [20]:
# Walk the 2019 holiday calendar of each country as provided by the `holidays` library.
# The loop body is currently a no-op: both print statements are commented out, so this
# cell produces no output and creates no features.
for c in [holidays.Finland, holidays.Sweden, holidays.Norway]:
#     print(c)
    for h in c(years = [2019], observed=True).items():
#         print(h)
        pass

In [21]:
def temporal_engineering(df):
    '''
    Function inspired by / borrowing from @teckmengwong and @ambrosm to create time features that will
    capture seasonality.

    Expects `df` to have a datetime `date` column plus `country`, `store` and `product` columns.
    Adds month/season/weekday flags, yearly Fourier terms (also interacted with country, store and
    product), and boolean day-indicator columns around year end, early January, May, June, the
    Sweden Rock festival, the last Wednesday of June, the first Sunday of November, early
    December in Finland, and Easter.

    The input frame is NOT modified: all features are added to a copy, which is returned.
    (Previously most columns were written into the caller's frame while a different,
    concatenated frame was returned.)
    '''
    # work on a copy so the caller's DataFrame is left untouched
    df = df.copy()

    df['month'] = df['date'].dt.month
    # Other calendar fields (week, day, day_of_year, day_of_week, ...) were not used by Teck Meng Wong
    # and are deliberately not generated here.

    # Teck Meng Wong mapped the integers to first-letters in triplets
    # I'm leaving it as integers, where winter=1, spring=2, summer=3, fall=4
    df['season'] = ((df['date'].dt.month % 12 + 3) // 3) #.map({1:'DJF', 2: 'MAM', 3:'JJA', 4:'SON'})

    df['wd4'] = df['date'].dt.weekday == 4    # Friday
    df['wd56'] = df['date'].dt.weekday >= 5   # Saturday or Sunday

    dayofyear = df.date.dt.dayofyear # for convenience
    
    # Fourier features over the year for k = 1, 5, 9, ..., 29, plus copies that are
    # zeroed out except for one country / store / product (interaction terms)
    for k in range(1, 32, 4):
        df[f'sin{k}'] = np.sin(dayofyear / 365 * 2 * math.pi * k)
        df[f'cos{k}'] = np.cos(dayofyear / 365 * 2 * math.pi * k)
        df[f'finland_sin{k}'] = np.where(df['country'] == 'Finland', df[f'sin{k}'], 0)
        df[f'finland_cos{k}'] = np.where(df['country'] == 'Finland', df[f'cos{k}'], 0)
        df[f'norway_sin{k}'] = np.where(df['country'] == 'Norway', df[f'sin{k}'], 0)
        df[f'norway_cos{k}'] = np.where(df['country'] == 'Norway', df[f'cos{k}'], 0)
        df[f'store_sin{k}'] = np.where(df['store'] == 'KaggleMart', df[f'sin{k}'], 0)
        df[f'store_cos{k}'] = np.where(df['store'] == 'KaggleMart', df[f'cos{k}'], 0)
        df[f'mug_sin{k}'] = np.where(df['product'] == 'Kaggle Mug', df[f'sin{k}'], 0)
        df[f'mug_cos{k}'] = np.where(df['product'] == 'Kaggle Mug', df[f'cos{k}'], 0)
        df[f'sticker_sin{k}'] = np.where(df['product'] == 'Kaggle Sticker', df[f'sin{k}'], 0)
        df[f'sticker_cos{k}'] = np.where(df['product'] == 'Kaggle Sticker', df[f'cos{k}'], 0)
    
    # (semiweekly k=14 and lunar k=21 terms were tried in the source notebook and are left out)
    df[f'season_sin'] = np.sin(dayofyear / 365 * 2 * math.pi * 91.5)
    df[f'season_cos'] = np.cos(dayofyear / 365 * 2 * math.pi * 91.5)
    # (per-country holiday indicators from the `holidays` library are not generated here yet)

    # End of year
    # Dec - teckmengwong
    for d in range(24, 32):
        df[f"dec{d}"] = (df.date.dt.month == 12) & (df.date.dt.day == d)
    # I'm unsure of the logic of only doing this for Norway
    for d in range(24, 32):
        df[f"n-dec{d}"] = (df.date.dt.month == 12) & (df.date.dt.day == d) & (df.country == 'Norway')
    
    # not sure why he's using different date ranges for each country here
    # Jan - teckmengwong
    for d in range(1, 14):
        df[f"f-jan{d}"] = (df.date.dt.month == 1) & (df.date.dt.day == d) & (df.country == 'Finland')
    for d in range(1, 10):
        df[f"n-jan{d}"] = (df.date.dt.month == 1) & (df.date.dt.day == d) & (df.country == 'Norway')
    for d in range(1, 15):
        df[f"s-jan{d}"] = (df.date.dt.month == 1) & (df.date.dt.day == d) & (df.country == 'Sweden')
    
    
    # May - teckmengwong
    for d in list(range(1, 10)): # May Day and after, I guess
        df[f"may{d}"] = (df.date.dt.month == 5) & (df.date.dt.day == d)
    for d in list(range(19, 26)): # Norway only
        df[f"may{d}"] = (df.date.dt.month == 5) & (df.date.dt.day == d) & (df.country == 'Norway')
    # June 
    for d in list(range(8, 14)):
        df[f"june{d}"] = (df.date.dt.month == 6) & (df.date.dt.day == d) & (df.country == 'Sweden')
    
    #Swedish Rock Concert - teckmengwong
    #Jun 3, 2015 – Jun 6, 2015
    #Jun 8, 2016 – Jun 11, 2016
    #Jun 7, 2017 – Jun 10, 2017
    #Jun 6, 2018 – Jun 10, 2018
    #Jun 5, 2019 – Jun 8, 2019
    # reference date per year is the festival's last day; offsets -3..2 are flagged for Sweden
    swed_rock_fest  = df.date.dt.year.map({2015: pd.Timestamp(('2015-06-6')),
                                         2016: pd.Timestamp(('2016-06-11')),
                                         2017: pd.Timestamp(('2017-06-10')),
                                         2018: pd.Timestamp(('2018-06-10')),
                                         2019: pd.Timestamp(('2019-06-8'))})

    df = pd.concat([df, pd.DataFrame({f"swed_rock_fest{d}":
                                      (df.date - swed_rock_fest == np.timedelta64(d, "D")) & (df.country == 'Sweden')
                                      for d in list(range(-3, 3))})], axis=1)

    
    # Last Wednesday of June - teckmengwong (offsets -4..5, every country except Norway)
    wed_june_date = df.date.dt.year.map({2015: pd.Timestamp(('2015-06-24')),
                                         2016: pd.Timestamp(('2016-06-29')),
                                         2017: pd.Timestamp(('2017-06-28')),
                                         2018: pd.Timestamp(('2018-06-27')),
                                         2019: pd.Timestamp(('2019-06-26'))})
    for d in list(range(-4, 6)):
        df[f"wed_june{d}"] = (df.date - wed_june_date == np.timedelta64(d, "D")) & (df.country != 'Norway')
        
    # First Sunday of November - teckmengwong (offsets 0..8, Norway only)
    sun_nov_date = df.date.dt.year.map({2015: pd.Timestamp(('2015-11-1')),
                                         2016: pd.Timestamp(('2016-11-6')),
                                         2017: pd.Timestamp(('2017-11-5')),
                                         2018: pd.Timestamp(('2018-11-4')),
                                         2019: pd.Timestamp(('2019-11-3'))})
    df = pd.concat([df, pd.DataFrame({f"sun_nov{d}":
                                      (df.date - sun_nov_date == np.timedelta64(d, "D")) & (df.country == 'Norway')
                                      for d in list(range(0, 9))})], axis=1)
    
    # First half of December (Independence Day of Finland, 6th of December) -teckmengwong
    df = pd.concat([df, pd.DataFrame({f"dec{d}":
                                      (df.date.dt.month == 12) & (df.date.dt.day == d) & (df.country == 'Finland')
                                      for d in list(range(6, 14))})], axis=1)
    
    # Easter -teckmengwong: flags for day offsets -2..10, 40..47 and 50..58 from Easter Sunday
    easter_date = df.date.apply(lambda date: pd.Timestamp(easter.easter(date.year)))
    df = pd.concat([df, pd.DataFrame({f"easter{d}":
                                      (df.date - easter_date == np.timedelta64(d, "D"))
                                      for d in list(range(-2, 11)) + list(range(40, 48)) + list(range(50, 59))})], axis=1)
    
    return df

In [22]:
temporal_all_df = temporal_engineering(all_df)

In [23]:
temporal_all_df

Unnamed: 0,row_id,date,country,store,product,num_sold,gdp,month,season,wd4,...,easter47,easter50,easter51,easter52,easter53,easter54,easter55,easter56,easter57,easter58
0,0,2015-01-01,Finland,KaggleMart,Kaggle Mug,329.0,5.461456,1,1,False,...,False,False,False,False,False,False,False,False,False,False
1,1,2015-01-01,Finland,KaggleMart,Kaggle Hat,520.0,5.461456,1,1,False,...,False,False,False,False,False,False,False,False,False,False
2,2,2015-01-01,Finland,KaggleMart,Kaggle Sticker,146.0,5.461456,1,1,False,...,False,False,False,False,False,False,False,False,False,False
3,3,2015-01-01,Finland,KaggleRama,Kaggle Mug,572.0,5.461456,1,1,False,...,False,False,False,False,False,False,False,False,False,False
4,4,2015-01-01,Finland,KaggleRama,Kaggle Hat,911.0,5.461456,1,1,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6565,32863,2019-12-31,Sweden,KaggleMart,Kaggle Hat,,6.282042,12,1,False,...,False,False,False,False,False,False,False,False,False,False
6566,32864,2019-12-31,Sweden,KaggleMart,Kaggle Sticker,,6.282042,12,1,False,...,False,False,False,False,False,False,False,False,False,False
6567,32865,2019-12-31,Sweden,KaggleRama,Kaggle Mug,,6.282042,12,1,False,...,False,False,False,False,False,False,False,False,False,False
6568,32866,2019-12-31,Sweden,KaggleRama,Kaggle Hat,,6.282042,12,1,False,...,False,False,False,False,False,False,False,False,False,False


At this point, the `temporal_all_df` DataFrame contains all the time features for both the training and testing sets.
* **Todo**: consider not only adding in holidays from `holidays`, but also borrowing ideas from the AmbrosM Linear notebook too (which creates fewer features, populating them instead with temporal distances from the selected holidays).

### Target Transformation
Now, I'll do the target transformation proposed by @AmbrosM. (I'll do it to the non-encoded DataFrame too, for testing with Prophet and NeuralProphet later.)

In [24]:
# Build the modelling target from sales and the `gdp` column.
# NOTE(review): `gdp` as produced by `add_gdp_data` is already log1p(GDP), so this computes
# log(num_sold / log1p(GDP)**gdp_exponent), not log(num_sold / GDP**gdp_exponent) as the
# surrounding prose describes -- TODO confirm this is intended and that any inverse
# transform applied to predictions uses the same expression.
# Rows without `num_sold` (the test rows) get a NaN target.
for df in [temporal_all_df]:
    df['target'] = np.log(df['num_sold'] / df['gdp']**gdp_exponent)

In [25]:
# encoded_all_df['target'] = np.log(encoded_all_df['num_sold'] / (encoded_all_df['gdp']**gdp_exponent))

In [26]:
temporal_all_df

Unnamed: 0,row_id,date,country,store,product,num_sold,gdp,month,season,wd4,...,easter50,easter51,easter52,easter53,easter54,easter55,easter56,easter57,easter58,target
0,0,2015-01-01,Finland,KaggleMart,Kaggle Mug,329.0,5.461456,1,1,False,...,False,False,False,False,False,False,False,False,False,3.738239
1,1,2015-01-01,Finland,KaggleMart,Kaggle Hat,520.0,5.461456,1,1,False,...,False,False,False,False,False,False,False,False,False,4.196010
2,2,2015-01-01,Finland,KaggleMart,Kaggle Sticker,146.0,5.461456,1,1,False,...,False,False,False,False,False,False,False,False,False,2.925788
3,3,2015-01-01,Finland,KaggleRama,Kaggle Mug,572.0,5.461456,1,1,False,...,False,False,False,False,False,False,False,False,False,4.291321
4,4,2015-01-01,Finland,KaggleRama,Kaggle Hat,911.0,5.461456,1,1,False,...,False,False,False,False,False,False,False,False,False,4.756724
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6565,32863,2019-12-31,Sweden,KaggleMart,Kaggle Hat,,6.282042,12,1,False,...,False,False,False,False,False,False,False,False,False,
6566,32864,2019-12-31,Sweden,KaggleMart,Kaggle Sticker,,6.282042,12,1,False,...,False,False,False,False,False,False,False,False,False,
6567,32865,2019-12-31,Sweden,KaggleRama,Kaggle Mug,,6.282042,12,1,False,...,False,False,False,False,False,False,False,False,False,
6568,32866,2019-12-31,Sweden,KaggleRama,Kaggle Hat,,6.282042,12,1,False,...,False,False,False,False,False,False,False,False,False,


### Label Encoding

I'm going to encapsulate this in a function so that it can be invoked just-in-time, in the hopes of avoiding confusions with DataFrames.

In [27]:
def label_encoder(df):
    """
    Integer-encode the `country`, `product` and `store` columns of `df`.

    One LabelEncoder per column is fitted on the untouched training data (`orig_train_df`)
    and applied to a copy of `df`; `df` itself is not modified.

    Returns a tuple `(le_dict, enc_df)`: the fitted encoders keyed by column name, and the
    encoded copy of `df`.
    """
    from sklearn.preprocessing import LabelEncoder

    le_dict = {}
    enc_df = df.copy()
    for feature in ('country', 'product', 'store'):
        encoder = LabelEncoder()
        encoder.fit(orig_train_df[feature])
        le_dict[feature] = encoder
        enc_df[feature] = encoder.transform(df[feature])
    return le_dict, enc_df

In [28]:
# for key in le_dict.keys():
#     print(f"Values for key {key} are {le_dict[key].inverse_transform(range(len(le_dict[key].values())))}")#"
# print(le_dict['country'].inverse_transform([0,1,2]))
# print(le_dict['product'].inverse_transform([0,1,2]))
# print(le_dict['store'].inverse_transform([0,1]))

```
['Finland' 'Norway' 'Sweden']
['Kaggle Hat' 'Kaggle Mug' 'Kaggle Sticker']
['KaggleMart' 'KaggleRama']
```

Now, we'll do the encoding.

At this point, the `encoded_all_df` can be used -- perhaps with a call to `LabelEncoder.inverse_transform` -- to recover the "original" data when necessary (e.g. for feeding it into Prophet and NeuralProphet)

In [29]:
# encoded_all_df = label_encoder(temporal_all_df)

### Pseudolabeling

I'm not going to try this right now, but I may return to it later -- I note that Teck Meng Wong had some good results with it.

### Data Splitting, Modification

Now that the preprocessing is done, I'm going to split the data back into the train and test sets; then, I'll create a view on the dataframes that omits the year. The year-less dataframes will be suitable for residual learning.

In [30]:
# all_df = encoded_all_df.drop(columns=['num_sold', 'row_id'])
all_df = temporal_all_df.drop(columns=['row_id']) # writing over the previous version of `all_df`

In [31]:
# Split back by position: `all_df` was built as train rows followed by test rows, so the
# first len(orig_train_df) rows are the labelled data and the remainder is the test set.
# This relies on the row order being unchanged since the concatenation.
tv_df = all_df[:len(orig_train_df)] # training and validation sets -- still not encoded
test_df = all_df[len(orig_train_df):] # still not encoded


In [32]:
# train_df = encoded_all_df.iloc[np.where(encoded_all_df['date'] < '2019-01-01'), :]
# test_df = encoded_all_df[[np.where(encoded_all_df['date'] > '2018-12-31')]]

# encoded_tv_df = encoded_all_df.drop(columns=['row_id'])[:len(orig_train_df)]
# encoded_test_df = encoded_all_df.drop(columns=['row_id'])[len(orig_train_df):]

# valid_df = tv_df[tv_df['date'] > '2017-12-31']
# train_df = tv_df[tv_df['date'] <= '2017-12-31']

# train_and_valid_residual_df = train_and_valid_df.drop(columns=['date'])
# test_residual_df = test_df.drop(columns=['date'])

# len(valid_df) + len(train_df) == len(tv_df)

# encoded_tv_df

# Training

### Forecasting Models Prep
First, we'll set up functions to handle the training of forecasting models which will discern trends, and which may -- or may not -- yield insights concerning seasonality. While the Scikit-Learn models will be able to share a single trainer function, the Prophet and NeuralProphet models have subtly different expectations of their data, and as such will require separate handling.

In [33]:
from sklearn.linear_model import Ridge, HuberRegressor, LinearRegression, Lasso
from sklearn.neural_network import MLPRegressor
from prophet import Prophet
from neuralprophet import NeuralProphet
# earth? wouldn't install via pip on my machine at first

#### (Preprepared Preds)

The next cell contains code to import already-existing predictions -- but I think it's better to centralize the code that produces them here, and will comment out the import code for now.

In [34]:
# prophet_trainset = load(predpath/'20220121_prophet_baseline_trainset.joblib')

# neural_trainset = load(predpath/'20220121_neuralprophet_baseline_trainset.joblib')
# neural_test_preds = load(predpath/'20220121_neuralprophet_baseline_testset.joblib')

# ridge_tv_preds = load(predpath/'20210121_ridge_baseline_trainset_preds.joblib')
# ridge_test_preds = load(predpath/'20220121_ridge_testset_preds.joblib')

And this cell would handle the parsing

In [35]:
# neural_tv_preds = neural_trainset['prophet_forecast']
# prophet_tv_preds = prophet_trainset['prophet_forecast']

# neural_train_preds = neural_tv_preds[:train_length]
# neural_valid_preds = neural_tv_preds[train_length:]

# prophet_train_preds = prophet_tv_preds[:train_length]
# prophet_valid_preds = prophet_tv_preds[train_length:]

# train_length = len(neural_trainset[neural_trainset['date'] <= '2017-12-31'])

# ridge_train_preds = ridge_tv_preds[:train_length]
# ridge_valid_preds = ridge_tv_preds[train_length:]

#### Scikit-Learn Linear Models Prep

Linear models from Scikit-Learn seemingly require that datetime data be converted to numerics.

In [36]:
# train_linear_df = train_df.copy()
# valid_linear_df = valid_df.copy()
# test_linear_df = test_df.copy()
# tv_linear_df = tv_df.copy()



### Forecasters

#### Hyperparameters
I'll hard-code them for now, but in the future may Optuna them. May want to create a dict of all the kwargs to be used for all the models, with the model names as keys

In [37]:
# Hard-coded hyperparameters for the Prophet forecaster.
# NOTE(review): presumably unpacked into the Prophet constructor by a trainer defined
# later -- confirm every key is accepted by the installed Prophet version.
prophet_kwargs = {
    'growth':'linear',
#     'holidays':holidays_train, # will add this in-function
    'n_changepoints':10,
    'changepoint_range':0.4,
    'yearly_seasonality':True,
    'weekly_seasonality':True,
    'daily_seasonality':False,
    'seasonality_mode':'additive',
    'seasonality_prior_scale':25,
    'holidays_prior_scale':100,
    'changepoint_prior_scale':0.01,
    'interval_width':0.5,
    'uncertainty_samples':False
}

# Hard-coded hyperparameters for NeuralProphet; used as the default `model_kwargs`
# of `neuralprophet_trainer`. Note the key is spelled `changepoints_range` here,
# versus `changepoint_range` in the Prophet dict above.
neuralprophet_kwargs = {
    'growth':'linear',
    'n_changepoints':10,
    'changepoints_range':0.4,
    'trend_reg':1,
    'trend_reg_threshold':False,
    'yearly_seasonality':True,
    'weekly_seasonality':True,
    'daily_seasonality':False,
    'seasonality_mode':'additive',
    'seasonality_reg':1,
    'n_forecasts':365,
    'normalize':'off'
}

# model_params['hyperparams'] = str(neuralprophet_kwargs)
# model_params['holiday_source'] = 'Prophet builtin for each country'

#### Trainers

##### NeuralProphet
I'm leaving the folds as they are. ~~Label encoding shouldn't matter -- the values are just being iterated over anyway.~~ It does matter because the Prophets use the strings to identify countries' holidays to add. Not sure about doing the target transform -- if you try it, just have the trainer call pass `target='target'`.

In [38]:
prophet_folds = [
    ('2015-01-01', '2018-01-01'),
    ('2018-01-01', '2019-01-01'),
]

In [39]:
# prophet_tv_df = tv_df_encoded.copy() # encoded_tv_df.copy()
# prophet_test_df = test_df_encoded.copy() # encoded_test_df.copy()

In [40]:
# for feature in ['country', 'product', 'store']:
#     prophet_tv_df[feature] = orig_train_df[feature]
#     prophet_test_df[feature] = orig_test_df[feature]


In [41]:
# prophet_tv_df.head()

In [42]:
# countries_enc = le_dict['country'].transform(countries)
# stores_enc = le_dict['store'].transform(stores)
# products_enc = le_dict['product'].transform(products)

# countries, countries_enc

In [43]:
def neuralprophet_trainer(model_kwargs=neuralprophet_kwargs, countries=countries, stores=stores, products=products, folds=prophet_folds, 
                          tv_df=tv_df, test_df=test_df,
                          target='num_sold', wandb_tracked=False):
    """Fit one NeuralProphet model per (country, store, product) series and collect its forecasts.

    For every combination of country, store and product, and for every pair of
    consecutive windows in ``folds``, a fresh model is fitted on the rows of the
    first window, then used to predict that window, the following (validation)
    window and the matching rows of the test frame.

    Parameters
    ----------
    model_kwargs : dict
        Keyword arguments forwarded to the ``NeuralProphet`` constructor.
    countries, stores, products : iterable
        Values of the 'country', 'store' and 'product' columns to iterate over.
        Countries are also passed to ``add_country_holidays``, so they must be
        the raw (un-encoded) names.
    folds : sequence of (start, end)
        Half-open date windows; window ``i`` is the training range and window
        ``i + 1`` the validation range. The last window is never trained on.
    tv_df, test_df : pandas.DataFrame
        Train/validation and test frames with 'date', 'country', 'store' and
        'product' columns; ``tv_df`` must also contain ``target``. Both are
        copied, so the caller's frames are not modified.
    target : str
        Name of the column in ``tv_df`` to forecast.
    wandb_tracked : bool
        When true, start a Weights & Biases run, log each validation SMAPE and
        the overall means, and finish the run before returning.

    Returns
    -------
    tuple of pandas.Series
        The 'neuralprophet_forecast' columns of the train/validation copy and
        of the test copy. Rows never selected by any window or combination are
        left unset (NaN).
    """
    train_smape = 0
    val_smape = 0
    n_scored = 0  # number of (series, fold) fits that contributed a score

    # create local versions of the dataframes, to avoid mutation
    df_train = tv_df.copy()
    df_test = test_df.copy()

    if wandb_tracked:
        wandb.init(
            project="202201_Kaggle_tabular_playground",
            save_code=True,
            tags=wandb_config['tags'],
            name=wandb_config['name'],
            notes=wandb_config['notes'],
            config=exmodel_config,
        )

    # no label encoding here: the country strings are needed for the holiday lookup
    for country in countries:
        for store in stores:
            for product in products:
                # row selectors for this series; they do not depend on the fold
                series_rows = (df_train['country'] == country) &\
                              (df_train['store'] == store) &\
                              (df_train['product'] == product)
                test_idx = (df_test['country'] == country) &\
                           (df_test['store'] == store) &\
                           (df_test['product'] == product)

                # pair each window with its successor; the final window is validation-only
                for (start, end), (val_start, val_end) in zip(folds, folds[1:]):
                    train_idx = series_rows &\
                                (df_train['date'] >= start) &\
                                (df_train['date'] < end)
                    val_idx = series_rows &\
                              (df_train['date'] >= val_start) &\
                              (df_train['date'] < val_end)

                    # rename to the 'ds' / 'y' column names the model is fitted on
                    renames = {'date': 'ds', target: 'y'}
                    train = df_train.loc[train_idx, ['date', target]].reset_index(drop=True).rename(columns=renames)
                    val = df_train.loc[val_idx, ['date', target]].reset_index(drop=True).rename(columns=renames)

                    model = NeuralProphet(**model_kwargs)
                    model = model.add_country_holidays(country_name=country)
                    model.fit(train, freq='D')

                    train_predictions = model.predict(train)['yhat1']
                    val_predictions = model.predict(val)['yhat1']
                    df_train.loc[train_idx, 'neuralprophet_forecast'] = train_predictions.values
                    df_train.loc[val_idx, 'neuralprophet_forecast'] = val_predictions.values

                    train_score = SMAPE(train['y'].values, train_predictions.values)
                    val_score = SMAPE(val['y'].values, val_predictions.values)

                    if wandb_tracked:
                        wandb.log({f"{(country,store,product)}_valid_smape": val_score})

                    train_smape += train_score
                    val_smape += val_score
                    n_scored += 1

                    print(f'\nTraining Range [{start}, {end}) - {country} - {store} - {product} - Train SMAPE: {train_score:4f}')
                    print(f'Validation Range [{val_start}, {val_end}) - {country} - {store} - {product} - Validation SMAPE: {val_score:4f}\n')

                    # the test frame has no target, so predict() is given a 'y' column of NaNs
                    test = df_test.loc[test_idx, ['date']].reset_index(drop=True)
                    test = test.rename(columns={'date': 'ds'})
                    test['y'] = np.nan
                    test_predictions = model.predict(test)['yhat1']

                    df_test.loc[test_idx, 'neuralprophet_forecast'] = test_predictions.values

    # average over the fits that were actually scored, rather than assuming
    # 3 countries x 2 stores x 3 products x 1 validated fold
    if n_scored:
        train_smape /= n_scored
        val_smape /= n_scored

    if wandb_tracked:
        wandb.log({'overall_train_smape': train_smape, 'overall_valid_smape': val_smape})
        wandb.finish()
    return df_train['neuralprophet_forecast'], df_test['neuralprophet_forecast']#, train_smape, val_smape

##### Prophet Trainer

In [44]:
def prophet_trainer(prophet_kwargs=prophet_kwargs, countries=countries, stores=stores, products=products, folds=prophet_folds, 
                    tv_df=tv_df, test_df=test_df,
                    target='num_sold', wandb_tracked=False):
    """Fit one Prophet model per (country, store, product) series and collect its forecasts.

    For every combination of country, store and product, and for every pair of
    consecutive windows in ``folds``, a fresh model is fitted on the rows of the
    first window, then used to predict that window, the following (validation)
    window and the matching rows of the test frame.

    Parameters
    ----------
    prophet_kwargs : dict
        Keyword arguments forwarded to the ``Prophet`` constructor.
    countries, stores, products : iterable
        Values of the 'country', 'store' and 'product' columns to iterate over.
        Countries are also passed to ``add_country_holidays``, so they must be
        the raw (un-encoded) names.
    folds : sequence of (start, end)
        Half-open date windows; window ``i`` is the training range and window
        ``i + 1`` the validation range. The last window is never trained on.
    tv_df, test_df : pandas.DataFrame
        Train/validation and test frames with 'date', 'country', 'store' and
        'product' columns; ``tv_df`` must also contain ``target``. Both are
        copied, so the caller's frames are not modified.
    target : str
        Name of the column in ``tv_df`` to forecast.
    wandb_tracked : bool
        When true, start a Weights & Biases run, log each validation SMAPE and
        the overall means, and finish the run before returning.

    Returns
    -------
    tuple of pandas.Series
        The 'prophet_forecast' columns of the train/validation copy and of the
        test copy. Rows never selected by any window or combination are left
        unset (NaN).
    """
    train_smape = 0
    val_smape = 0
    n_scored = 0  # number of (series, fold) fits that contributed a score

    # create local versions of the dataframes, to avoid mutation
    df_train = tv_df.copy()
    df_test = test_df.copy()

    if wandb_tracked:
        wandb.init(
            project="202201_Kaggle_tabular_playground",
            save_code=True,
            tags=wandb_config['tags'],
            name=wandb_config['name'],
            notes=wandb_config['notes'],
            config=exmodel_config,
        )

    for country in countries:
        for store in stores:
            for product in products:
                # row selectors for this series; they do not depend on the fold
                series_rows = (df_train['country'] == country) &\
                              (df_train['store'] == store) &\
                              (df_train['product'] == product)
                test_idx = (df_test['country'] == country) &\
                           (df_test['store'] == store) &\
                           (df_test['product'] == product)

                # pair each window with its successor; the final window is validation-only
                for (start, end), (val_start, val_end) in zip(folds, folds[1:]):
                    train_idx = series_rows &\
                                (df_train['date'] >= start) &\
                                (df_train['date'] < end)
                    val_idx = series_rows &\
                              (df_train['date'] >= val_start) &\
                              (df_train['date'] < val_end)

                    # rename to the 'ds' / 'y' column names the model is fitted on
                    renames = {'date': 'ds', target: 'y'}
                    train = df_train.loc[train_idx, ['date', target]].reset_index(drop=True).rename(columns=renames)
                    val = df_train.loc[val_idx, ['date', target]].reset_index(drop=True).rename(columns=renames)

                    model = Prophet(**prophet_kwargs)
                    model.add_country_holidays(country_name=country)
                    model.fit(train)

                    train_predictions = model.predict(train[['ds']])['yhat']
                    val_predictions = model.predict(val[['ds']])['yhat']
                    df_train.loc[train_idx, 'prophet_forecast'] = train_predictions.values
                    df_train.loc[val_idx, 'prophet_forecast'] = val_predictions.values

                    train_score = SMAPE(train['y'].values, train_predictions.values)
                    val_score = SMAPE(val['y'].values, val_predictions.values)

                    if wandb_tracked:
                        wandb.log({f"{(country,store,product)}_valid_smape": val_score})

                    train_smape += train_score
                    val_smape += val_score
                    n_scored += 1

                    print(f'\nTraining Range [{start}, {end}) - {country} - {store} - {product} - Train SMAPE: {train_score:4f}')
                    print(f'Validation Range [{val_start}, {val_end}) - {country} - {store} - {product} - Validation SMAPE: {val_score:4f}\n')

                    test = df_test.loc[test_idx, ['date']].reset_index(drop=True)
                    test = test.rename(columns={'date': 'ds'})
                    test_predictions = model.predict(test[['ds']])['yhat']

                    df_test.loc[test_idx, 'prophet_forecast'] = test_predictions.values

    # average over the fits that were actually scored, rather than assuming
    # 3 countries x 2 stores x 3 products x 1 validated fold
    if n_scored:
        train_smape /= n_scored
        val_smape /= n_scored

    if wandb_tracked:
        wandb.log({'overall_train_smape': train_smape, 'overall_valid_smape': val_smape})
        wandb.finish()
    return df_train['prophet_forecast'], df_test['prophet_forecast']#, train_smape, val_smape

##### Scikit-Learn Models

In [45]:
def sklearn_trainer(estimator, model_kwargs={}, tv_df=tv_df, test_df=test_df, #X=X, y=y, X_valid=X_valid, y_valid=y_valid, X_test=X_test, 
                    folds=prophet_folds, countries=countries, stores=stores, products=products, target='target',
                    by_combo=False, wandb_tracked=False):
    """Fit a scikit-learn style estimator and return its forecasts for the train/validation and test frames.

    Parameters
    ----------
    estimator : callable
        Estimator class (or factory); it is called as ``estimator(**model_kwargs)``
        and the result must provide ``fit(X, y)`` and ``predict(X)``.
    model_kwargs : dict
        Keyword arguments for ``estimator``. It is only read, never modified.
    tv_df, test_df : pandas.DataFrame
        Train/validation and test frames. Both must contain 'date', 'country',
        'store', 'product', 'num_sold' and 'target' columns (and 'gdp' when
        ``target == 'target'``). Copies are label-encoded; the caller's frames
        are not modified.
    folds : sequence of (start, end)
        Half-open date windows, used only when ``by_combo`` is true: window
        ``i`` is the training range and window ``i + 1`` the validation range.
    countries, stores, products : sequence
        Raw category values; they are encoded with the fitted label encoders
        before being matched against the encoded frames.
    target : str
        'num_sold' to model the raw target, anything else to model the
        transformed 'target' column; the other of the two columns is dropped.
    by_combo : bool
        If true, fit a separate model per (country, store, product) series and
        fold. Otherwise fit one model on all rows dated up to 2017-12-31 and
        predict the remaining (2018) rows as validation.
    wandb_tracked : bool
        When true, a Weights & Biases run is opened at the start and finished
        before returning.

    Returns
    -------
    tuple of pandas.Series
        The 'model_forecast' columns of the encoded train/validation frame and
        of the encoded test frame. When ``target == 'target'`` the forecasts are
        mapped back with ``exp(forecast) * gdp ** gdp_exponent``.
    """
    # label-encode private copies (Scikit-Learn models need numeric categories, the *Prophets don't)
    # NOTE(review): the test frame is passed through label_encoder separately; this
    # presumably yields the same codes as for the training frame -- confirm.
    le_dict, tv_df = label_encoder(tv_df.copy())
    _, test_df = label_encoder(test_df.copy())

    # encode the lists of countries, stores, and products so they match the encoded columns
    countries = le_dict['country'].transform(countries)
    stores = le_dict['store'].transform(stores)
    products = le_dict['product'].transform(products)

    if wandb_tracked:
        wandb.init(
            project="202201_Kaggle_tabular_playground",
            save_code=True,
            tags=wandb_config['tags'],
            name=wandb_config['name'],
            notes=wandb_config['notes'],
            config=exmodel_config,
        )

    # drop whichever version of the dependent variable is not being used
    unused_target = 'target' if target == 'num_sold' else 'num_sold'
    tv_df = tv_df.drop(columns=[unused_target])
    test_df = test_df.drop(columns=[unused_target])

    def _to_features(frame):
        # the target is never a feature; 'model_forecast' exists once an earlier
        # iteration has written predictions back and must not leak in either
        extra = ['model_forecast'] if 'model_forecast' in frame.columns else []
        return frame.drop(columns=[target] + extra)

    if by_combo: # handling each combination of country, store, and product separately
        for country in countries:
            for store in stores:
                for product in products:
                    # row selectors for this series; they do not depend on the fold
                    series_rows = (tv_df['country'] == country) &\
                                  (tv_df['store'] == store) &\
                                  (tv_df['product'] == product)
                    test_idx = (test_df['country'] == country) &\
                               (test_df['store'] == store) &\
                               (test_df['product'] == product)

                    # pair each window with its successor; the final window is validation-only
                    for (start, end), (val_start, val_end) in zip(folds, folds[1:]):
                        train_idx = series_rows &\
                                    (tv_df['date'] >= start) &\
                                    (tv_df['date'] < end)
                        val_idx = series_rows &\
                                  (tv_df['date'] >= val_start) &\
                                  (tv_df['date'] < val_end)

                        train = tv_df.loc[train_idx, :].reset_index(drop=True)
                        val = tv_df.loc[val_idx, :].reset_index(drop=True)
                        test = test_df.loc[test_idx, :].reset_index(drop=True)

                        # estimators need numeric inputs, so dates become proleptic ordinals
                        for frame in (train, val, test):
                            frame['date'] = frame['date'].map(dt.datetime.toordinal)

                        X = _to_features(train)
                        X_valid = _to_features(val)
                        X_test = _to_features(test)
                        y = train[target]

                        model = estimator(**model_kwargs)
                        model.fit(X, y)

                        tv_df.loc[train_idx, 'model_forecast'] = model.predict(X)
                        tv_df.loc[val_idx, 'model_forecast'] = model.predict(X_valid)
                        test_df.loc[test_idx, 'model_forecast'] = model.predict(X_test)

    else: # don't separate out by combination of country, store, and product
        # split the dataset such that 2018 is the validation set
        train_rows = tv_df['date'] <= '2017-12-31'
        valid_rows = tv_df['date'] > '2017-12-31'

        # explicit copies: the 'date' column is overwritten below and must not touch tv_df
        train = tv_df.loc[train_rows].copy()
        valid = tv_df.loc[valid_rows].copy()
        test = test_df.copy()

        # convert the datetime objects to integers
        for frame in (train, valid, test):
            frame['date'] = frame['date'].map(dt.datetime.toordinal)

        # separate out the independent and dependent variables
        X = train.drop(columns=[target])
        y = train[target]
        X_valid = valid.drop(columns=[target])
        X_test = test.drop(columns=[target])

        model = estimator(**model_kwargs)
        model.fit(X, y)

        # write predictions back through the same masks that selected the rows,
        # so each forecast lands on the row it was computed for
        tv_df.loc[train_rows, 'model_forecast'] = model.predict(X)
        tv_df.loc[valid_rows, 'model_forecast'] = model.predict(X_valid)
        test_df['model_forecast'] = model.predict(X_test)

    # reverse the dependent variable transform if appropriate
    if target == 'target':
        tv_df['model_forecast'] = np.exp(tv_df['model_forecast']) * tv_df['gdp']**gdp_exponent
        test_df['model_forecast'] = np.exp(test_df['model_forecast']) * test_df['gdp']**gdp_exponent

    if wandb_tracked:
        # close the run opened above so it is not left dangling
        wandb.finish()

    return tv_df['model_forecast'], test_df['model_forecast']
    

#### Calls

In [46]:
%time
prophet_tv_preds, prophet_test_preds = prophet_trainer(target='num_sold')

CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 4.29 µs

Training Range [2015-01-01, 2018-01-01) - Sweden - KaggleMart - Kaggle Mug - Train SMAPE: 6.815541
Validation Range [2018-01-01, 2019-01-01) - Sweden - KaggleMart - Kaggle Mug - Validation SMAPE: 8.011073


Training Range [2015-01-01, 2018-01-01) - Sweden - KaggleMart - Kaggle Hat - Train SMAPE: 7.210369
Validation Range [2018-01-01, 2019-01-01) - Sweden - KaggleMart - Kaggle Hat - Validation SMAPE: 7.470548


Training Range [2015-01-01, 2018-01-01) - Sweden - KaggleMart - Kaggle Sticker - Train SMAPE: 7.056273
Validation Range [2018-01-01, 2019-01-01) - Sweden - KaggleMart - Kaggle Sticker - Validation SMAPE: 7.264969


Training Range [2015-01-01, 2018-01-01) - Sweden - KaggleRama - Kaggle Mug - Train SMAPE: 6.716048
Validation Range [2018-01-01, 2019-01-01) - Sweden - KaggleRama - Kaggle Mug - Validation SMAPE: 7.406634


Training Range [2015-01-01, 2018-01-01) - Sweden - KaggleRama - Kaggle Hat - Train SMAPE: 7.000068


In [47]:
%time
neural_tv_preds, neural_test_preds = neuralprophet_trainer(target='num_sold')

INFO - (NP.config.__post_init__) - Note: Trend changepoint regularization is experimental.
INFO:NP.config:Note: Trend changepoint regularization is experimental.
INFO - (NP.config.__post_init__) - Note: Fourier-based seasonality regularization is experimental.
INFO:NP.config:Note: Fourier-based seasonality regularization is experimental.
INFO - (NP.config.set_auto_batch_epoch) - Auto-set batch_size to 32
INFO:NP.config:Auto-set batch_size to 32
INFO - (NP.config.set_auto_batch_epoch) - Auto-set epochs to 176
INFO:NP.config:Auto-set epochs to 176


  0%|          | 0/252 [00:00<?, ?it/s]

CPU times: user 1 µs, sys: 0 ns, total: 1 µs
Wall time: 2.86 µs
Index(['ds', 'y'], dtype='object')


INFO - (NP.utils_torch.lr_range_test) - lr-range-test results: steep: 1.77E+01, min: 6.62E+01
INFO:NP.utils_torch:lr-range-test results: steep: 1.77E+01, min: 6.62E+01


  0%|          | 0/252 [00:00<?, ?it/s]

INFO - (NP.utils_torch.lr_range_test) - lr-range-test results: steep: 2.08E+01, min: 6.62E+01
INFO:NP.utils_torch:lr-range-test results: steep: 2.08E+01, min: 6.62E+01


  0%|          | 0/252 [00:00<?, ?it/s]

INFO - (NP.utils_torch.lr_range_test) - lr-range-test results: steep: 1.75E+00, min: 6.62E+01
INFO:NP.utils_torch:lr-range-test results: steep: 1.75E+00, min: 6.62E+01
INFO - (NP.forecaster._init_train_loader) - lr-range-test selected learning rate: 8.63E+00
INFO:NP.forecaster:lr-range-test selected learning rate: 8.63E+00
Epoch[176/176]: 100%|██████████| 176/176 [00:05<00:00, 31.77it/s, SmoothL1Loss=16.7, MAE=17.2, RMSE=31.8, RegLoss=0.349] 
INFO - (NP.config.__post_init__) - Note: Trend changepoint regularization is experimental.
INFO:NP.config:Note: Trend changepoint regularization is experimental.
INFO - (NP.config.__post_init__) - Note: Fourier-based seasonality regularization is experimental.
INFO:NP.config:Note: Fourier-based seasonality regularization is experimental.
INFO - (NP.config.set_auto_batch_epoch) - Auto-set batch_size to 32
INFO:NP.config:Auto-set batch_size to 32
INFO - (NP.config.set_auto_batch_epoch) - Auto-set epochs to 176
INFO:NP.config:Auto-set epochs to 176



Training Range [2015-01-01, 2018-01-01) - Sweden - KaggleMart - Kaggle Mug - Train SMAPE: 6.787962
Validation Range [2018-01-01, 2019-01-01) - Sweden - KaggleMart - Kaggle Mug - Validation SMAPE: 6.350349

Index(['ds', 'y'], dtype='object')


  0%|          | 0/252 [00:00<?, ?it/s]

INFO - (NP.utils_torch.lr_range_test) - lr-range-test results: steep: 4.38E+01, min: 6.04E+00
INFO:NP.utils_torch:lr-range-test results: steep: 4.38E+01, min: 6.04E+00


  0%|          | 0/252 [00:00<?, ?it/s]

INFO - (NP.utils_torch.lr_range_test) - lr-range-test results: steep: 2.87E+00, min: 6.62E+01
INFO:NP.utils_torch:lr-range-test results: steep: 2.87E+00, min: 6.62E+01


  0%|          | 0/252 [00:00<?, ?it/s]

INFO - (NP.utils_torch.lr_range_test) - lr-range-test results: steep: 3.71E+01, min: 6.04E+00
INFO:NP.utils_torch:lr-range-test results: steep: 3.71E+01, min: 6.04E+00
INFO - (NP.forecaster._init_train_loader) - lr-range-test selected learning rate: 1.67E+01
INFO:NP.forecaster:lr-range-test selected learning rate: 1.67E+01
Epoch[176/176]: 100%|██████████| 176/176 [00:05<00:00, 32.39it/s, SmoothL1Loss=31.9, MAE=32.4, RMSE=56.7, RegLoss=0.628] 
INFO - (NP.config.__post_init__) - Note: Trend changepoint regularization is experimental.
INFO:NP.config:Note: Trend changepoint regularization is experimental.
INFO - (NP.config.__post_init__) - Note: Fourier-based seasonality regularization is experimental.
INFO:NP.config:Note: Fourier-based seasonality regularization is experimental.
INFO - (NP.config.set_auto_batch_epoch) - Auto-set batch_size to 32
INFO:NP.config:Auto-set batch_size to 32
INFO - (NP.config.set_auto_batch_epoch) - Auto-set epochs to 176
INFO:NP.config:Auto-set epochs to 176



Training Range [2015-01-01, 2018-01-01) - Sweden - KaggleMart - Kaggle Hat - Train SMAPE: 7.182275
Validation Range [2018-01-01, 2019-01-01) - Sweden - KaggleMart - Kaggle Hat - Validation SMAPE: 6.327551

Index(['ds', 'y'], dtype='object')


  0%|          | 0/252 [00:00<?, ?it/s]

INFO - (NP.utils_torch.lr_range_test) - lr-range-test results: steep: 1.27E+01, min: 6.09E+01
INFO:NP.utils_torch:lr-range-test results: steep: 1.27E+01, min: 6.09E+01


  0%|          | 0/252 [00:00<?, ?it/s]

INFO - (NP.utils_torch.lr_range_test) - lr-range-test results: steep: 9.91E+00, min: 5.61E+01
INFO:NP.utils_torch:lr-range-test results: steep: 9.91E+00, min: 5.61E+01


  0%|          | 0/252 [00:00<?, ?it/s]

INFO - (NP.utils_torch.lr_range_test) - lr-range-test results: steep: 9.04E-01, min: 5.17E+01
INFO:NP.utils_torch:lr-range-test results: steep: 9.04E-01, min: 5.17E+01
INFO - (NP.forecaster._init_train_loader) - lr-range-test selected learning rate: 4.84E+00
INFO:NP.forecaster:lr-range-test selected learning rate: 4.84E+00
Epoch[176/176]: 100%|██████████| 176/176 [00:05<00:00, 32.46it/s, SmoothL1Loss=7.88, MAE=8.36, RMSE=14.7, RegLoss=0.127]  
INFO - (NP.config.__post_init__) - Note: Trend changepoint regularization is experimental.
INFO:NP.config:Note: Trend changepoint regularization is experimental.
INFO - (NP.config.__post_init__) - Note: Fourier-based seasonality regularization is experimental.
INFO:NP.config:Note: Fourier-based seasonality regularization is experimental.
INFO - (NP.config.set_auto_batch_epoch) - Auto-set batch_size to 32
INFO:NP.config:Auto-set batch_size to 32
INFO - (NP.config.set_auto_batch_epoch) - Auto-set epochs to 176
INFO:NP.config:Auto-set epochs to 176



Training Range [2015-01-01, 2018-01-01) - Sweden - KaggleMart - Kaggle Sticker - Train SMAPE: 6.704859
Validation Range [2018-01-01, 2019-01-01) - Sweden - KaggleMart - Kaggle Sticker - Validation SMAPE: 6.226494

Index(['ds', 'y'], dtype='object')


  0%|          | 0/252 [00:00<?, ?it/s]

INFO - (NP.utils_torch.lr_range_test) - lr-range-test results: steep: 3.12E+00, min: 5.56E+00
INFO:NP.utils_torch:lr-range-test results: steep: 3.12E+00, min: 5.56E+00


  0%|          | 0/252 [00:00<?, ?it/s]

INFO - (NP.utils_torch.lr_range_test) - lr-range-test results: steep: 3.12E+00, min: 5.56E+00
INFO:NP.utils_torch:lr-range-test results: steep: 3.12E+00, min: 5.56E+00


  0%|          | 0/252 [00:00<?, ?it/s]

INFO - (NP.utils_torch.lr_range_test) - lr-range-test results: steep: 3.42E+01, min: 5.56E+00
INFO:NP.utils_torch:lr-range-test results: steep: 3.42E+01, min: 5.56E+00
INFO - (NP.forecaster._init_train_loader) - lr-range-test selected learning rate: 6.93E+00
INFO:NP.forecaster:lr-range-test selected learning rate: 6.93E+00
Epoch[176/176]: 100%|██████████| 176/176 [00:05<00:00, 31.68it/s, SmoothL1Loss=29.9, MAE=30.4, RMSE=56.2, RegLoss=0.597] 
INFO - (NP.config.__post_init__) - Note: Trend changepoint regularization is experimental.
INFO:NP.config:Note: Trend changepoint regularization is experimental.
INFO - (NP.config.__post_init__) - Note: Fourier-based seasonality regularization is experimental.
INFO:NP.config:Note: Fourier-based seasonality regularization is experimental.
INFO - (NP.config.set_auto_batch_epoch) - Auto-set batch_size to 32
INFO:NP.config:Auto-set batch_size to 32
INFO - (NP.config.set_auto_batch_epoch) - Auto-set epochs to 176
INFO:NP.config:Auto-set epochs to 176



Training Range [2015-01-01, 2018-01-01) - Sweden - KaggleRama - Kaggle Mug - Train SMAPE: 6.883841
Validation Range [2018-01-01, 2019-01-01) - Sweden - KaggleRama - Kaggle Mug - Validation SMAPE: 6.699610

Index(['ds', 'y'], dtype='object')


  0%|          | 0/252 [00:00<?, ?it/s]

INFO - (NP.utils_torch.lr_range_test) - lr-range-test results: steep: 5.12E+00, min: 1.08E+01
INFO:NP.utils_torch:lr-range-test results: steep: 5.12E+00, min: 1.08E+01


  0%|          | 0/252 [00:00<?, ?it/s]

INFO - (NP.utils_torch.lr_range_test) - lr-range-test results: steep: 5.12E+00, min: 1.08E+01
INFO:NP.utils_torch:lr-range-test results: steep: 5.12E+00, min: 1.08E+01


  0%|          | 0/252 [00:00<?, ?it/s]

INFO - (NP.utils_torch.lr_range_test) - lr-range-test results: steep: 5.12E+00, min: 1.17E+01
INFO:NP.utils_torch:lr-range-test results: steep: 5.12E+00, min: 1.17E+01
INFO - (NP.forecaster._init_train_loader) - lr-range-test selected learning rate: 5.12E+00
INFO:NP.forecaster:lr-range-test selected learning rate: 5.12E+00
Epoch[176/176]: 100%|██████████| 176/176 [00:05<00:00, 31.54it/s, SmoothL1Loss=57.4, MAE=57.9, RMSE=103, RegLoss=1.15]  
INFO - (NP.config.__post_init__) - Note: Trend changepoint regularization is experimental.
INFO:NP.config:Note: Trend changepoint regularization is experimental.
INFO - (NP.config.__post_init__) - Note: Fourier-based seasonality regularization is experimental.
INFO:NP.config:Note: Fourier-based seasonality regularization is experimental.
INFO - (NP.config.set_auto_batch_epoch) - Auto-set batch_size to 32
INFO:NP.config:Auto-set batch_size to 32
INFO - (NP.config.set_auto_batch_epoch) - Auto-set epochs to 176
INFO:NP.config:Auto-set epochs to 176



Training Range [2015-01-01, 2018-01-01) - Sweden - KaggleRama - Kaggle Hat - Train SMAPE: 7.433342
Validation Range [2018-01-01, 2019-01-01) - Sweden - KaggleRama - Kaggle Hat - Validation SMAPE: 6.897087

Index(['ds', 'y'], dtype='object')


  0%|          | 0/252 [00:00<?, ?it/s]

INFO - (NP.utils_torch.lr_range_test) - lr-range-test results: steep: 1.77E+01, min: 6.62E+01
INFO:NP.utils_torch:lr-range-test results: steep: 1.77E+01, min: 6.62E+01


  0%|          | 0/252 [00:00<?, ?it/s]

INFO - (NP.utils_torch.lr_range_test) - lr-range-test results: steep: 1.61E+00, min: 6.62E+01
INFO:NP.utils_torch:lr-range-test results: steep: 1.61E+00, min: 6.62E+01


  0%|          | 0/252 [00:00<?, ?it/s]

INFO - (NP.utils_torch.lr_range_test) - lr-range-test results: steep: 1.61E+00, min: 6.62E+01
INFO:NP.utils_torch:lr-range-test results: steep: 1.61E+00, min: 6.62E+01
INFO - (NP.forecaster._init_train_loader) - lr-range-test selected learning rate: 3.58E+00
INFO:NP.forecaster:lr-range-test selected learning rate: 3.58E+00
Epoch[176/176]: 100%|██████████| 176/176 [00:05<00:00, 31.28it/s, SmoothL1Loss=13.9, MAE=14.3, RMSE=25.8, RegLoss=0.252] 
INFO - (NP.config.__post_init__) - Note: Trend changepoint regularization is experimental.
INFO:NP.config:Note: Trend changepoint regularization is experimental.
INFO - (NP.config.__post_init__) - Note: Fourier-based seasonality regularization is experimental.
INFO:NP.config:Note: Fourier-based seasonality regularization is experimental.
INFO - (NP.config.set_auto_batch_epoch) - Auto-set batch_size to 32
INFO:NP.config:Auto-set batch_size to 32
INFO - (NP.config.set_auto_batch_epoch) - Auto-set epochs to 176
INFO:NP.config:Auto-set epochs to 176


  0%|          | 0/252 [00:00<?, ?it/s]


Training Range [2015-01-01, 2018-01-01) - Sweden - KaggleRama - Kaggle Sticker - Train SMAPE: 6.566485
Validation Range [2018-01-01, 2019-01-01) - Sweden - KaggleRama - Kaggle Sticker - Validation SMAPE: 6.482125

Index(['ds', 'y'], dtype='object')


INFO - (NP.utils_torch.lr_range_test) - lr-range-test results: steep: 1.61E+00, min: 4.76E+01
INFO:NP.utils_torch:lr-range-test results: steep: 1.61E+00, min: 4.76E+01


  0%|          | 0/252 [00:00<?, ?it/s]

INFO - (NP.utils_torch.lr_range_test) - lr-range-test results: steep: 1.92E+01, min: 6.62E+01
INFO:NP.utils_torch:lr-range-test results: steep: 1.92E+01, min: 6.62E+01


  0%|          | 0/252 [00:00<?, ?it/s]

INFO - (NP.utils_torch.lr_range_test) - lr-range-test results: steep: 1.61E+00, min: 6.62E+01
INFO:NP.utils_torch:lr-range-test results: steep: 1.61E+00, min: 6.62E+01
INFO - (NP.forecaster._init_train_loader) - lr-range-test selected learning rate: 3.68E+00
INFO:NP.forecaster:lr-range-test selected learning rate: 3.68E+00
Epoch[176/176]: 100%|██████████| 176/176 [00:05<00:00, 30.91it/s, SmoothL1Loss=14.6, MAE=15.1, RMSE=28.7, RegLoss=0.349] 
INFO - (NP.config.__post_init__) - Note: Trend changepoint regularization is experimental.
INFO:NP.config:Note: Trend changepoint regularization is experimental.
INFO - (NP.config.__post_init__) - Note: Fourier-based seasonality regularization is experimental.
INFO:NP.config:Note: Fourier-based seasonality regularization is experimental.
INFO - (NP.config.set_auto_batch_epoch) - Auto-set batch_size to 32
INFO:NP.config:Auto-set batch_size to 32
INFO - (NP.config.set_auto_batch_epoch) - Auto-set epochs to 176
INFO:NP.config:Auto-set epochs to 176


  0%|          | 0/252 [00:00<?, ?it/s]


Training Range [2015-01-01, 2018-01-01) - Finland - KaggleMart - Kaggle Mug - Train SMAPE: 7.017518
Validation Range [2018-01-01, 2019-01-01) - Finland - KaggleMart - Kaggle Mug - Validation SMAPE: 8.584579

Index(['ds', 'y'], dtype='object')


INFO - (NP.utils_torch.lr_range_test) - lr-range-test results: steep: 3.15E+01, min: 6.62E+01
INFO:NP.utils_torch:lr-range-test results: steep: 3.15E+01, min: 6.62E+01


  0%|          | 0/252 [00:00<?, ?it/s]

INFO - (NP.utils_torch.lr_range_test) - lr-range-test results: steep: 2.90E+01, min: 6.62E+01
INFO:NP.utils_torch:lr-range-test results: steep: 2.90E+01, min: 6.62E+01


  0%|          | 0/252 [00:00<?, ?it/s]

INFO - (NP.utils_torch.lr_range_test) - lr-range-test results: steep: 2.64E+00, min: 6.62E+01
INFO:NP.utils_torch:lr-range-test results: steep: 2.64E+00, min: 6.62E+01
INFO - (NP.forecaster._init_train_loader) - lr-range-test selected learning rate: 1.34E+01
INFO:NP.forecaster:lr-range-test selected learning rate: 1.34E+01
Epoch[176/176]: 100%|██████████| 176/176 [00:05<00:00, 32.04it/s, SmoothL1Loss=26.1, MAE=26.6, RMSE=47.2, RegLoss=0.561] 
INFO - (NP.config.__post_init__) - Note: Trend changepoint regularization is experimental.
INFO:NP.config:Note: Trend changepoint regularization is experimental.
INFO - (NP.config.__post_init__) - Note: Fourier-based seasonality regularization is experimental.
INFO:NP.config:Note: Fourier-based seasonality regularization is experimental.
INFO - (NP.config.set_auto_batch_epoch) - Auto-set batch_size to 32
INFO:NP.config:Auto-set batch_size to 32
INFO - (NP.config.set_auto_batch_epoch) - Auto-set epochs to 176
INFO:NP.config:Auto-set epochs to 176



Training Range [2015-01-01, 2018-01-01) - Finland - KaggleMart - Kaggle Hat - Train SMAPE: 6.985428
Validation Range [2018-01-01, 2019-01-01) - Finland - KaggleMart - Kaggle Hat - Validation SMAPE: 8.460387

Index(['ds', 'y'], dtype='object')


  0%|          | 0/252 [00:00<?, ?it/s]

INFO - (NP.utils_torch.lr_range_test) - lr-range-test results: steep: 8.32E-01, min: 4.76E+01
INFO:NP.utils_torch:lr-range-test results: steep: 8.32E-01, min: 4.76E+01


  0%|          | 0/252 [00:00<?, ?it/s]

INFO - (NP.utils_torch.lr_range_test) - lr-range-test results: steep: 7.66E-01, min: 4.76E+01
INFO:NP.utils_torch:lr-range-test results: steep: 7.66E-01, min: 4.76E+01


  0%|          | 0/252 [00:00<?, ?it/s]

INFO - (NP.utils_torch.lr_range_test) - lr-range-test results: steep: 8.32E-01, min: 6.09E+01
INFO:NP.utils_torch:lr-range-test results: steep: 8.32E-01, min: 6.09E+01
INFO - (NP.forecaster._init_train_loader) - lr-range-test selected learning rate: 8.10E-01
INFO:NP.forecaster:lr-range-test selected learning rate: 8.10E-01
Epoch[176/176]: 100%|██████████| 176/176 [00:05<00:00, 31.55it/s, SmoothL1Loss=6.43, MAE=6.9, RMSE=13.1, RegLoss=0.169]   
INFO - (NP.config.__post_init__) - Note: Trend changepoint regularization is experimental.
INFO:NP.config:Note: Trend changepoint regularization is experimental.
INFO - (NP.config.__post_init__) - Note: Fourier-based seasonality regularization is experimental.
INFO:NP.config:Note: Fourier-based seasonality regularization is experimental.
INFO - (NP.config.set_auto_batch_epoch) - Auto-set batch_size to 32
INFO:NP.config:Auto-set batch_size to 32
INFO - (NP.config.set_auto_batch_epoch) - Auto-set epochs to 176
INFO:NP.config:Auto-set epochs to 176


  0%|          | 0/252 [00:00<?, ?it/s]


Training Range [2015-01-01, 2018-01-01) - Finland - KaggleMart - Kaggle Sticker - Train SMAPE: 6.463170
Validation Range [2018-01-01, 2019-01-01) - Finland - KaggleMart - Kaggle Sticker - Validation SMAPE: 7.364813

Index(['ds', 'y'], dtype='object')


INFO - (NP.utils_torch.lr_range_test) - lr-range-test results: steep: 3.15E+01, min: 6.62E+01
INFO:NP.utils_torch:lr-range-test results: steep: 3.15E+01, min: 6.62E+01


  0%|          | 0/252 [00:00<?, ?it/s]

INFO - (NP.utils_torch.lr_range_test) - lr-range-test results: steep: 2.64E+00, min: 6.62E+01
INFO:NP.utils_torch:lr-range-test results: steep: 2.64E+00, min: 6.62E+01


  0%|          | 0/252 [00:00<?, ?it/s]

INFO - (NP.utils_torch.lr_range_test) - lr-range-test results: steep: 2.64E+00, min: 6.62E+01
INFO:NP.utils_torch:lr-range-test results: steep: 2.64E+00, min: 6.62E+01
INFO - (NP.forecaster._init_train_loader) - lr-range-test selected learning rate: 6.04E+00
INFO:NP.forecaster:lr-range-test selected learning rate: 6.04E+00
Epoch[176/176]: 100%|██████████| 176/176 [00:05<00:00, 31.06it/s, SmoothL1Loss=25.1, MAE=25.6, RMSE=49.2, RegLoss=0.547] 
INFO - (NP.config.__post_init__) - Note: Trend changepoint regularization is experimental.
INFO:NP.config:Note: Trend changepoint regularization is experimental.
INFO - (NP.config.__post_init__) - Note: Fourier-based seasonality regularization is experimental.
INFO:NP.config:Note: Fourier-based seasonality regularization is experimental.
INFO - (NP.config.set_auto_batch_epoch) - Auto-set batch_size to 32
INFO:NP.config:Auto-set batch_size to 32
INFO - (NP.config.set_auto_batch_epoch) - Auto-set epochs to 176
INFO:NP.config:Auto-set epochs to 176


  0%|          | 0/252 [00:00<?, ?it/s]


Training Range [2015-01-01, 2018-01-01) - Finland - KaggleRama - Kaggle Mug - Train SMAPE: 6.799852
Validation Range [2018-01-01, 2019-01-01) - Finland - KaggleRama - Kaggle Mug - Validation SMAPE: 8.443204

Index(['ds', 'y'], dtype='object')


INFO - (NP.utils_torch.lr_range_test) - lr-range-test results: steep: 4.34E+00, min: 6.62E+01
INFO:NP.utils_torch:lr-range-test results: steep: 4.34E+00, min: 6.62E+01


  0%|          | 0/252 [00:00<?, ?it/s]

INFO - (NP.utils_torch.lr_range_test) - lr-range-test results: steep: 4.34E+00, min: 9.91E+00
INFO:NP.utils_torch:lr-range-test results: steep: 4.34E+00, min: 9.91E+00


  0%|          | 0/252 [00:00<?, ?it/s]

INFO - (NP.utils_torch.lr_range_test) - lr-range-test results: steep: 4.34E+00, min: 9.91E+00
INFO:NP.utils_torch:lr-range-test results: steep: 4.34E+00, min: 9.91E+00
INFO - (NP.forecaster._init_train_loader) - lr-range-test selected learning rate: 4.34E+00
INFO:NP.forecaster:lr-range-test selected learning rate: 4.34E+00
Epoch[176/176]: 100%|██████████| 176/176 [00:05<00:00, 31.05it/s, SmoothL1Loss=47.1, MAE=47.6, RMSE=83.4, RegLoss=1.09]  
INFO - (NP.config.__post_init__) - Note: Trend changepoint regularization is experimental.
INFO:NP.config:Note: Trend changepoint regularization is experimental.
INFO - (NP.config.__post_init__) - Note: Fourier-based seasonality regularization is experimental.
INFO:NP.config:Note: Fourier-based seasonality regularization is experimental.
INFO - (NP.config.set_auto_batch_epoch) - Auto-set batch_size to 32
INFO:NP.config:Auto-set batch_size to 32
INFO - (NP.config.set_auto_batch_epoch) - Auto-set epochs to 176
INFO:NP.config:Auto-set epochs to 176


  0%|          | 0/252 [00:00<?, ?it/s]


Training Range [2015-01-01, 2018-01-01) - Finland - KaggleRama - Kaggle Hat - Train SMAPE: 7.236571
Validation Range [2018-01-01, 2019-01-01) - Finland - KaggleRama - Kaggle Hat - Validation SMAPE: 8.263741

Index(['ds', 'y'], dtype='object')


INFO - (NP.utils_torch.lr_range_test) - lr-range-test results: steep: 1.37E+00, min: 6.62E+01
INFO:NP.utils_torch:lr-range-test results: steep: 1.37E+00, min: 6.62E+01


  0%|          | 0/252 [00:00<?, ?it/s]

INFO - (NP.utils_torch.lr_range_test) - lr-range-test results: steep: 1.63E+01, min: 6.62E+01
INFO:NP.utils_torch:lr-range-test results: steep: 1.63E+01, min: 6.62E+01


  0%|          | 0/252 [00:00<?, ?it/s]

INFO - (NP.utils_torch.lr_range_test) - lr-range-test results: steep: 1.48E+00, min: 6.62E+01
INFO:NP.utils_torch:lr-range-test results: steep: 1.48E+00, min: 6.62E+01
INFO - (NP.forecaster._init_train_loader) - lr-range-test selected learning rate: 3.21E+00
INFO:NP.forecaster:lr-range-test selected learning rate: 3.21E+00
Epoch[176/176]: 100%|██████████| 176/176 [00:05<00:00, 30.07it/s, SmoothL1Loss=12.3, MAE=12.7, RMSE=23.2, RegLoss=0.289] 
INFO - (NP.config.__post_init__) - Note: Trend changepoint regularization is experimental.
INFO:NP.config:Note: Trend changepoint regularization is experimental.
INFO - (NP.config.__post_init__) - Note: Fourier-based seasonality regularization is experimental.
INFO:NP.config:Note: Fourier-based seasonality regularization is experimental.
INFO - (NP.config.set_auto_batch_epoch) - Auto-set batch_size to 32
INFO:NP.config:Auto-set batch_size to 32
INFO - (NP.config.set_auto_batch_epoch) - Auto-set epochs to 176
INFO:NP.config:Auto-set epochs to 176


  0%|          | 0/252 [00:00<?, ?it/s]


Training Range [2015-01-01, 2018-01-01) - Finland - KaggleRama - Kaggle Sticker - Train SMAPE: 6.841822
Validation Range [2018-01-01, 2019-01-01) - Finland - KaggleRama - Kaggle Sticker - Validation SMAPE: 7.555420

Index(['ds', 'y'], dtype='object')


INFO - (NP.utils_torch.lr_range_test) - lr-range-test results: steep: 2.43E+00, min: 6.62E+01
INFO:NP.utils_torch:lr-range-test results: steep: 2.43E+00, min: 6.62E+01


  0%|          | 0/252 [00:00<?, ?it/s]

INFO - (NP.utils_torch.lr_range_test) - lr-range-test results: steep: 2.90E+01, min: 4.34E+00
INFO:NP.utils_torch:lr-range-test results: steep: 2.90E+01, min: 4.34E+00


  0%|          | 0/252 [00:00<?, ?it/s]

INFO - (NP.utils_torch.lr_range_test) - lr-range-test results: steep: 2.43E+00, min: 6.62E+01
INFO:NP.utils_torch:lr-range-test results: steep: 2.43E+00, min: 6.62E+01
INFO - (NP.forecaster._init_train_loader) - lr-range-test selected learning rate: 5.56E+00
INFO:NP.forecaster:lr-range-test selected learning rate: 5.56E+00
Epoch[176/176]: 100%|██████████| 176/176 [00:05<00:00, 30.29it/s, SmoothL1Loss=28.9, MAE=29.4, RMSE=49, RegLoss=0.656]   
INFO - (NP.config.__post_init__) - Note: Trend changepoint regularization is experimental.
INFO:NP.config:Note: Trend changepoint regularization is experimental.
INFO - (NP.config.__post_init__) - Note: Fourier-based seasonality regularization is experimental.
INFO:NP.config:Note: Fourier-based seasonality regularization is experimental.
INFO - (NP.config.set_auto_batch_epoch) - Auto-set batch_size to 32
INFO:NP.config:Auto-set batch_size to 32
INFO - (NP.config.set_auto_batch_epoch) - Auto-set epochs to 176
INFO:NP.config:Auto-set epochs to 176


  0%|          | 0/252 [00:00<?, ?it/s]


Training Range [2015-01-01, 2018-01-01) - Norway - KaggleMart - Kaggle Mug - Train SMAPE: 8.572676
Validation Range [2018-01-01, 2019-01-01) - Norway - KaggleMart - Kaggle Mug - Validation SMAPE: 14.630094

Index(['ds', 'y'], dtype='object')


INFO - (NP.utils_torch.lr_range_test) - lr-range-test results: steep: 4.00E+00, min: 6.62E+01
INFO:NP.utils_torch:lr-range-test results: steep: 4.00E+00, min: 6.62E+01


  0%|          | 0/252 [00:00<?, ?it/s]

INFO - (NP.utils_torch.lr_range_test) - lr-range-test results: steep: 5.61E+01, min: 9.12E+00
INFO:NP.utils_torch:lr-range-test results: steep: 5.61E+01, min: 9.12E+00


  0%|          | 0/252 [00:00<?, ?it/s]

INFO - (NP.utils_torch.lr_range_test) - lr-range-test results: steep: 4.34E+00, min: 6.62E+01
INFO:NP.utils_torch:lr-range-test results: steep: 4.34E+00, min: 6.62E+01
INFO - (NP.forecaster._init_train_loader) - lr-range-test selected learning rate: 9.91E+00
INFO:NP.forecaster:lr-range-test selected learning rate: 9.91E+00
Epoch[176/176]: 100%|██████████| 176/176 [00:05<00:00, 30.85it/s, SmoothL1Loss=52.7, MAE=53.1, RMSE=87, RegLoss=1.17]    
INFO - (NP.config.__post_init__) - Note: Trend changepoint regularization is experimental.
INFO:NP.config:Note: Trend changepoint regularization is experimental.
INFO - (NP.config.__post_init__) - Note: Fourier-based seasonality regularization is experimental.
INFO:NP.config:Note: Fourier-based seasonality regularization is experimental.
INFO - (NP.config.set_auto_batch_epoch) - Auto-set batch_size to 32
INFO:NP.config:Auto-set batch_size to 32
INFO - (NP.config.set_auto_batch_epoch) - Auto-set epochs to 176
INFO:NP.config:Auto-set epochs to 176


  0%|          | 0/252 [00:00<?, ?it/s]


Training Range [2015-01-01, 2018-01-01) - Norway - KaggleMart - Kaggle Hat - Train SMAPE: 8.632346
Validation Range [2018-01-01, 2019-01-01) - Norway - KaggleMart - Kaggle Hat - Validation SMAPE: 13.595614

Index(['ds', 'y'], dtype='object')


INFO - (NP.utils_torch.lr_range_test) - lr-range-test results: steep: 1.27E+01, min: 5.17E+01
INFO:NP.utils_torch:lr-range-test results: steep: 1.27E+01, min: 5.17E+01


  0%|          | 0/252 [00:00<?, ?it/s]

INFO - (NP.utils_torch.lr_range_test) - lr-range-test results: steep: 1.37E+00, min: 6.09E+01
INFO:NP.utils_torch:lr-range-test results: steep: 1.37E+00, min: 6.09E+01


  0%|          | 0/252 [00:00<?, ?it/s]

INFO - (NP.utils_torch.lr_range_test) - lr-range-test results: steep: 1.26E+00, min: 6.62E+01
INFO:NP.utils_torch:lr-range-test results: steep: 1.26E+00, min: 6.62E+01
INFO - (NP.forecaster._init_train_loader) - lr-range-test selected learning rate: 2.79E+00
INFO:NP.forecaster:lr-range-test selected learning rate: 2.79E+00
Epoch[176/176]: 100%|██████████| 176/176 [00:05<00:00, 31.27it/s, SmoothL1Loss=13.2, MAE=13.7, RMSE=23, RegLoss=0.382]   
INFO - (NP.config.__post_init__) - Note: Trend changepoint regularization is experimental.
INFO:NP.config:Note: Trend changepoint regularization is experimental.
INFO - (NP.config.__post_init__) - Note: Fourier-based seasonality regularization is experimental.
INFO:NP.config:Note: Fourier-based seasonality regularization is experimental.
INFO - (NP.config.set_auto_batch_epoch) - Auto-set batch_size to 32
INFO:NP.config:Auto-set batch_size to 32
INFO - (NP.config.set_auto_batch_epoch) - Auto-set epochs to 176
INFO:NP.config:Auto-set epochs to 176


  0%|          | 0/252 [00:00<?, ?it/s]


Training Range [2015-01-01, 2018-01-01) - Norway - KaggleMart - Kaggle Sticker - Train SMAPE: 8.025111
Validation Range [2018-01-01, 2019-01-01) - Norway - KaggleMart - Kaggle Sticker - Validation SMAPE: 14.149867

Index(['ds', 'y'], dtype='object')


INFO - (NP.utils_torch.lr_range_test) - lr-range-test results: steep: 4.34E+00, min: 7.73E+00
INFO:NP.utils_torch:lr-range-test results: steep: 4.34E+00, min: 7.73E+00


  0%|          | 0/252 [00:00<?, ?it/s]

INFO - (NP.utils_torch.lr_range_test) - lr-range-test results: steep: 4.34E+00, min: 8.40E+00
INFO:NP.utils_torch:lr-range-test results: steep: 4.34E+00, min: 8.40E+00


  0%|          | 0/252 [00:00<?, ?it/s]

INFO - (NP.utils_torch.lr_range_test) - lr-range-test results: steep: 4.34E+00, min: 8.40E+00
INFO:NP.utils_torch:lr-range-test results: steep: 4.34E+00, min: 8.40E+00
INFO - (NP.forecaster._init_train_loader) - lr-range-test selected learning rate: 4.34E+00
INFO:NP.forecaster:lr-range-test selected learning rate: 4.34E+00
Epoch[176/176]: 100%|██████████| 176/176 [00:05<00:00, 30.51it/s, SmoothL1Loss=51.4, MAE=51.9, RMSE=86.6, RegLoss=1.21]  
INFO - (NP.config.__post_init__) - Note: Trend changepoint regularization is experimental.
INFO:NP.config:Note: Trend changepoint regularization is experimental.
INFO - (NP.config.__post_init__) - Note: Fourier-based seasonality regularization is experimental.
INFO:NP.config:Note: Fourier-based seasonality regularization is experimental.
INFO - (NP.config.set_auto_batch_epoch) - Auto-set batch_size to 32
INFO:NP.config:Auto-set batch_size to 32
INFO - (NP.config.set_auto_batch_epoch) - Auto-set epochs to 176
INFO:NP.config:Auto-set epochs to 176


  0%|          | 0/252 [00:00<?, ?it/s]


Training Range [2015-01-01, 2018-01-01) - Norway - KaggleRama - Kaggle Mug - Train SMAPE: 8.800466
Validation Range [2018-01-01, 2019-01-01) - Norway - KaggleRama - Kaggle Mug - Validation SMAPE: 12.493967

Index(['ds', 'y'], dtype='object')


INFO - (NP.utils_torch.lr_range_test) - lr-range-test results: steep: 7.12E+00, min: 1.63E+01
INFO:NP.utils_torch:lr-range-test results: steep: 7.12E+00, min: 1.63E+01


  0%|          | 0/252 [00:00<?, ?it/s]

INFO - (NP.utils_torch.lr_range_test) - lr-range-test results: steep: 7.12E+00, min: 1.63E+01
INFO:NP.utils_torch:lr-range-test results: steep: 7.12E+00, min: 1.63E+01


  0%|          | 0/252 [00:00<?, ?it/s]

INFO - (NP.utils_torch.lr_range_test) - lr-range-test results: steep: 7.12E+00, min: 1.63E+01
INFO:NP.utils_torch:lr-range-test results: steep: 7.12E+00, min: 1.63E+01
INFO - (NP.forecaster._init_train_loader) - lr-range-test selected learning rate: 7.12E+00
INFO:NP.forecaster:lr-range-test selected learning rate: 7.12E+00
Epoch[176/176]: 100%|██████████| 176/176 [00:05<00:00, 30.39it/s, SmoothL1Loss=109, MAE=109, RMSE=174, RegLoss=2]     
INFO - (NP.config.__post_init__) - Note: Trend changepoint regularization is experimental.
INFO:NP.config:Note: Trend changepoint regularization is experimental.
INFO - (NP.config.__post_init__) - Note: Fourier-based seasonality regularization is experimental.
INFO:NP.config:Note: Fourier-based seasonality regularization is experimental.
INFO - (NP.config.set_auto_batch_epoch) - Auto-set batch_size to 32
INFO:NP.config:Auto-set batch_size to 32
INFO - (NP.config.set_auto_batch_epoch) - Auto-set epochs to 176
INFO:NP.config:Auto-set epochs to 176


  0%|          | 0/252 [00:00<?, ?it/s]


Training Range [2015-01-01, 2018-01-01) - Norway - KaggleRama - Kaggle Hat - Train SMAPE: 10.737415
Validation Range [2018-01-01, 2019-01-01) - Norway - KaggleRama - Kaggle Hat - Validation SMAPE: 8.430614

Index(['ds', 'y'], dtype='object')


INFO - (NP.utils_torch.lr_range_test) - lr-range-test results: steep: 2.67E+01, min: 6.62E+01
INFO:NP.utils_torch:lr-range-test results: steep: 2.67E+01, min: 6.62E+01


  0%|          | 0/252 [00:00<?, ?it/s]

INFO - (NP.utils_torch.lr_range_test) - lr-range-test results: steep: 2.46E+01, min: 4.00E+00
INFO:NP.utils_torch:lr-range-test results: steep: 2.46E+01, min: 4.00E+00


  0%|          | 0/252 [00:00<?, ?it/s]

INFO - (NP.utils_torch.lr_range_test) - lr-range-test results: steep: 2.24E+00, min: 6.62E+01
INFO:NP.utils_torch:lr-range-test results: steep: 2.24E+00, min: 6.62E+01
INFO - (NP.forecaster._init_train_loader) - lr-range-test selected learning rate: 1.14E+01
INFO:NP.forecaster:lr-range-test selected learning rate: 1.14E+01
Epoch[176/176]: 100%|██████████| 176/176 [00:05<00:00, 30.58it/s, SmoothL1Loss=23.6, MAE=24, RMSE=39.7, RegLoss=0.535]   



Training Range [2015-01-01, 2018-01-01) - Norway - KaggleRama - Kaggle Sticker - Train SMAPE: 8.070777
Validation Range [2018-01-01, 2019-01-01) - Norway - KaggleRama - Kaggle Sticker - Validation SMAPE: 12.557456



In [48]:
# %time
# ridge_tv_preds, ridge_test_preds = sklearn_trainer(estimator=Ridge)

In [49]:
%time
ridge_combo_tv_preds, ridge_combo_test_preds = sklearn_trainer(estimator=Ridge, by_combo=True)

CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 3.81 µs


In [50]:
# linear_tv_preds, linear_test_preds = sklearn_trainer(estimator=LinearRegression)

In [51]:
# LinearRegression forecaster, trained through sklearn_trainer with by_combo=True.
linear_combo_tv_preds, linear_combo_test_preds = sklearn_trainer(
    estimator=LinearRegression,
    by_combo=True,
)

In [52]:
# HuberRegressor forecaster, trained through sklearn_trainer with by_combo=True.
huber_combo_tv_preds, huber_combo_test_preds = sklearn_trainer(
    estimator=HuberRegressor,
    by_combo=True,
)

In [53]:
# MLPRegressor forecaster, trained through sklearn_trainer with by_combo=True.
# It is scored alongside the others but is not part of the forecast bundle.
mlp_combo_tv_preds, mlp_combo_test_preds = sklearn_trainer(
    estimator=MLPRegressor,
    by_combo=True,
)

In [54]:
# Lasso forecaster, trained through sklearn_trainer with by_combo=True.
lasso_combo_tv_preds, lasso_combo_test_preds = sklearn_trainer(
    estimator=Lasso,
    by_combo=True,
)

In [55]:
# Actual sales for the train+validation rows; used as y_true when scoring the forecasters.
y_tv = orig_train_df['num_sold']

In [57]:
# Score every forecaster's train+validation predictions against the actual sales.
# Each SMAPE is printed with its model name so the numbers can be read without
# counting positions in a list.
smape_candidates = {
    'prophet': prophet_tv_preds,
    'ridge': ridge_combo_tv_preds,
    'linear': linear_combo_tv_preds,
    'huber': huber_combo_tv_preds,
    'mlp': mlp_combo_tv_preds,
    'neuralprophet': neural_tv_preds,
    'lasso': lasso_combo_tv_preds,
}
for model_name, preds in smape_candidates.items():
    print(f"{model_name}: {SMAPE(y_pred=preds, y_true=y_tv)}")

7.098824805150043
4.781041939812455
4.175573461450916
14.396222549610735
90.78388674222738
7.926662675729486
15.11713989060885


Clearly, doing it by combination is the way to go. I think Ridge and LinearRegression are definitely good to use; Prophet and NeuralProphet are worth including as well. Huber, probably not; MLPRegressor, definitely not, barring the discovery of better hyperparameters.

### Forecast Bundling
Now, create an iterable collection consisting of all the forecaster predictions.

In [58]:
# Train+validation predictions of each bundled forecaster, keyed by a short model name.
forecast_tv_preds = dict(
    prophet=prophet_tv_preds,
    neuralprophet=neural_tv_preds,
    ridge=ridge_combo_tv_preds,
    linear=linear_combo_tv_preds,
    huber=huber_combo_tv_preds,
    lasso=lasso_combo_tv_preds,
)

# Same models, same key order, holding the test predictions instead.
forecast_test_preds = dict(
    prophet=prophet_test_preds,
    neuralprophet=neural_test_preds,
    ridge=ridge_combo_test_preds,
    linear=linear_combo_test_preds,
    huber=huber_combo_test_preds,
    lasso=lasso_combo_test_preds,
)

In [59]:
# Column layout: date, actual sales, then one column per bundled forecaster.
tv_pred_columns = {
    'date': orig_train_df['date'],
    'num_sold': orig_train_df['num_sold'],
}
tv_pred_columns.update(forecast_tv_preds)
tv_preds = pd.DataFrame(tv_pred_columns)

In [60]:
tv_preds

Unnamed: 0,date,num_sold,prophet,neuralprophet,ridge,linear,huber,lasso
0,2015-01-01,329,346.560416,329.134521,283.272258,324.188332,239.868861,176.381725
1,2015-01-01,520,536.203586,458.008301,439.771734,503.081569,334.618687,334.453055
2,2015-01-01,146,143.412803,145.325577,120.885683,136.667932,96.086063,89.802390
3,2015-01-01,572,590.117165,552.571167,488.810710,555.531957,330.014790,307.788806
4,2015-01-01,911,939.673009,781.967285,771.701702,890.469201,875.922092,583.510103
...,...,...,...,...,...,...,...,...
26293,2018-12-31,823,898.322121,669.418457,723.714830,846.597603,407.206130,397.285794
26294,2018-12-31,250,253.512355,227.158142,205.880079,241.048951,132.620712,126.765608
26295,2018-12-31,1004,1039.635205,715.639648,832.192362,975.785339,401.224848,430.368706
26296,2018-12-31,1441,1526.908216,980.234009,1255.885410,1468.776593,863.230649,689.775798


In [61]:
# Keep only the rows dated after 2017-12-31, i.e. from 2018-01-01 onwards.
is_validation_row = tv_preds['date'] > '2017-12-31'
valid_forecast_preds = tv_preds[is_validation_row]
valid_forecast_preds

Unnamed: 0,date,num_sold,prophet,neuralprophet,ridge,linear,huber,lasso
19728,2018-01-01,405,383.365770,374.809906,345.296498,417.388135,249.679981,211.676443
19729,2018-01-01,621,602.594228,539.082520,536.708119,637.949409,348.560475,348.732376
19730,2018-01-01,176,161.735807,167.919098,147.203184,174.086997,99.898973,107.780206
19731,2018-01-01,714,656.201669,635.251648,596.599377,713.788713,343.757663,368.614186
19732,2018-01-01,1043,1044.903702,940.582886,930.576635,1110.805235,913.849095,604.430617
...,...,...,...,...,...,...,...,...
26293,2018-12-31,823,898.322121,669.418457,723.714830,846.597603,407.206130,397.285794
26294,2018-12-31,250,253.512355,227.158142,205.880079,241.048951,132.620712,126.765608
26295,2018-12-31,1004,1039.635205,715.639648,832.192362,975.785339,401.224848,430.368706
26296,2018-12-31,1441,1526.908216,980.234009,1255.885410,1468.776593,863.230649,689.775798


In [62]:
# Drop the non-numeric 'date' column explicitly before correlating: older pandas
# discarded it silently, but since pandas 2.0 corr() defaults to numeric_only=False
# and tries to convert every column to float.
valid_forecast_preds.drop(columns='date').corr()

Unnamed: 0,num_sold,prophet,neuralprophet,ridge,linear,huber,lasso
num_sold,1.0,0.972938,0.96144,0.992806,0.995334,0.910517,0.899862
prophet,0.972938,1.0,0.990336,0.985617,0.975158,0.937816,0.939803
neuralprophet,0.96144,0.990336,1.0,0.978596,0.962073,0.947772,0.950331
ridge,0.992806,0.985617,0.978596,1.0,0.996412,0.926965,0.919398
linear,0.995334,0.975158,0.962073,0.996412,1.0,0.910123,0.901341
huber,0.910517,0.937816,0.947772,0.926965,0.910123,1.0,0.922899
lasso,0.899862,0.939803,0.950331,0.919398,0.901341,0.922899,1.0


In [63]:
# Drop the non-numeric 'date' column explicitly before correlating: older pandas
# discarded it silently, but since pandas 2.0 corr() defaults to numeric_only=False
# and tries to convert every column to float.
tv_preds.drop(columns='date').corr()

Unnamed: 0,num_sold,prophet,neuralprophet,ridge,linear,huber,lasso
num_sold,1.0,0.975093,0.960082,0.99311,0.995846,0.907911,0.899352
prophet,0.975093,1.0,0.985693,0.986481,0.977785,0.933401,0.92766
neuralprophet,0.960082,0.985693,1.0,0.97801,0.962507,0.938389,0.939915
ridge,0.99311,0.986481,0.97801,1.0,0.996573,0.926616,0.919336
linear,0.995846,0.977785,0.962507,0.996573,1.0,0.909135,0.900973
huber,0.907911,0.933401,0.938389,0.926616,0.909135,1.0,0.923215
lasso,0.899352,0.92766,0.939915,0.919336,0.900973,0.923215,1.0


So Ridge and Linear correlate best with the actual `num_sold`, and are quite similar to one another; Prophet and NeuralProphet are next best, and likewise similar to one another; Huber is an outlier (and not so good). Lasso performs worst of all, but its predictions correlate more closely with Prophet's and NeuralProphet's than with those of the other models. They're closest of all to NeuralProphet's, interestingly.

In [64]:
# Column layout: date, then one column per bundled forecaster.
# There is no 'num_sold' column here, unlike in the train+validation frame.
test_pred_columns = {'date': orig_test_df['date']}
test_pred_columns.update(forecast_test_preds)
test_forecast_preds = pd.DataFrame(test_pred_columns)

In [65]:
test_forecast_preds

Unnamed: 0,date,prophet,neuralprophet,ridge,linear,huber,lasso
0,2019-01-01,402.296576,383.543457,359.817081,392.328390,248.728792,221.118411
1,2019-01-01,631.642099,551.044983,560.783324,620.092467,347.337935,347.623611
2,2019-01-01,169.992094,173.065674,153.552940,166.788184,99.483226,112.590591
3,2019-01-01,684.559821,648.783691,622.237644,675.674104,342.549613,384.790895
4,2019-01-01,1092.875362,965.683716,969.065554,1084.980801,911.102831,601.189400
...,...,...,...,...,...,...,...
6565,2019-12-31,917.085861,674.891174,742.331990,797.964290,404.876685,391.134492
6566,2019-12-31,261.569590,232.082169,210.920805,225.576610,131.774790,129.930008
6567,2019-12-31,1065.400121,730.457275,852.091174,925.788409,398.926697,440.077496
6568,2019-12-31,1568.407214,1005.392944,1286.814471,1397.071983,858.595973,677.887045


### Residuals

At this point, I have two DataFrames containing predictions from the forecasting models (which try to learn trends): 
1. `valid_forecast_preds`
2. `test_forecast_preds`

Both still contain the feature `'date'` (which has a datetime type), and the validation preds also contain `num_sold`.

The goal now will be to iteratively generate a final prediction DataFrame containing all possible combinations of the forecasting model predictions and residual predictions.

In [66]:
# Forecasters whose predictions provide the basis for the residual predictions.
forecast_models = [
    'prophet',
    'neuralprophet',
    'ridge',
    'linear',
    'huber',
    'lasso',
]

In [67]:
# encoded_all_df

In [68]:
# dump(encoded_all_df, datapath/'encoded_train+testset_with_gdp+teckmengwong-time-features+transformed-target-for-train.joblib')

In [69]:
# hybrid_train_df = encoded_all_df[:len(train_df)]
# hybrid_valid_df = encoded_all_df[len(train_df): len(train_df)+len(valid_df)]

In [70]:
# hybrid_test_df = encoded_all_df[len(train_df)+len(valid_df):]

In [71]:
# hybrid_valid_df

Note that the `tv_df` still contains both transformed targets and `num_sold`.

In [72]:
tv_df

Unnamed: 0,date,country,store,product,num_sold,gdp,month,season,wd4,wd56,...,easter50,easter51,easter52,easter53,easter54,easter55,easter56,easter57,easter58,target
0,2015-01-01,Finland,KaggleMart,Kaggle Mug,329.0,5.461456,1,1,False,False,...,False,False,False,False,False,False,False,False,False,3.738239
1,2015-01-01,Finland,KaggleMart,Kaggle Hat,520.0,5.461456,1,1,False,False,...,False,False,False,False,False,False,False,False,False,4.196010
2,2015-01-01,Finland,KaggleMart,Kaggle Sticker,146.0,5.461456,1,1,False,False,...,False,False,False,False,False,False,False,False,False,2.925788
3,2015-01-01,Finland,KaggleRama,Kaggle Mug,572.0,5.461456,1,1,False,False,...,False,False,False,False,False,False,False,False,False,4.291321
4,2015-01-01,Finland,KaggleRama,Kaggle Hat,911.0,5.461456,1,1,False,False,...,False,False,False,False,False,False,False,False,False,4.756724
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26293,2018-12-31,Sweden,KaggleMart,Kaggle Hat,823.0,6.321586,12,1,False,False,...,False,False,False,False,False,False,False,False,False,4.477861
26294,2018-12-31,Sweden,KaggleMart,Kaggle Sticker,250.0,6.321586,12,1,False,False,...,False,False,False,False,False,False,False,False,False,3.286366
26295,2018-12-31,Sweden,KaggleRama,Kaggle Mug,1004.0,6.321586,12,1,False,False,...,False,False,False,False,False,False,False,False,False,4.676652
26296,2018-12-31,Sweden,KaggleRama,Kaggle Hat,1441.0,6.321586,12,1,False,False,...,False,False,False,False,False,False,False,False,False,5.037997


In [73]:
test_df

Unnamed: 0,date,country,store,product,num_sold,gdp,month,season,wd4,wd56,...,easter50,easter51,easter52,easter53,easter54,easter55,easter56,easter57,easter58,target
0,2019-01-01,Finland,KaggleMart,Kaggle Mug,,5.597614,1,1,False,False,...,False,False,False,False,False,False,False,False,False,
1,2019-01-01,Finland,KaggleMart,Kaggle Hat,,5.597614,1,1,False,False,...,False,False,False,False,False,False,False,False,False,
2,2019-01-01,Finland,KaggleMart,Kaggle Sticker,,5.597614,1,1,False,False,...,False,False,False,False,False,False,False,False,False,
3,2019-01-01,Finland,KaggleRama,Kaggle Mug,,5.597614,1,1,False,False,...,False,False,False,False,False,False,False,False,False,
4,2019-01-01,Finland,KaggleRama,Kaggle Hat,,5.597614,1,1,False,False,...,False,False,False,False,False,False,False,False,False,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6565,2019-12-31,Sweden,KaggleMart,Kaggle Hat,,6.282042,12,1,False,False,...,False,False,False,False,False,False,False,False,False,
6566,2019-12-31,Sweden,KaggleMart,Kaggle Sticker,,6.282042,12,1,False,False,...,False,False,False,False,False,False,False,False,False,
6567,2019-12-31,Sweden,KaggleRama,Kaggle Mug,,6.282042,12,1,False,False,...,False,False,False,False,False,False,False,False,False,
6568,2019-12-31,Sweden,KaggleRama,Kaggle Hat,,6.282042,12,1,False,False,...,False,False,False,False,False,False,False,False,False,


One question: should the full `tv_df` be passed as data, or only the validation set? I think it's better to supply as much data as possible, so I'll use the whole thing for now.

In [74]:
from sklearn.model_selection import GroupKFold

In [75]:
GroupKFold?

[0;31mInit signature:[0m [0mGroupKFold[0m[0;34m([0m[0mn_splits[0m[0;34m=[0m[0;36m5[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m     
K-fold iterator variant with non-overlapping groups.

The same group will not appear in two different folds (the number of
distinct groups has to be at least equal to the number of folds).

The folds are approximately balanced in the sense that the number of
distinct groups is approximately the same in each fold.

Read more in the :ref:`User Guide <group_k_fold>`.

Parameters
----------
n_splits : int, default=5
    Number of folds. Must be at least 2.

    .. versionchanged:: 0.22
        ``n_splits`` default value changed from 3 to 5.

Examples
--------
>>> import numpy as np
>>> from sklearn.model_selection import GroupKFold
>>> X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]])
>>> y = np.array([1, 2, 3, 4])
>>> groups = np.array([0, 0, 2, 2])
>>> group_kfold = GroupKFold(n_splits=2)
>>> group_kfold.get_n_splits(X, y, groups)

In [76]:
len(tv_df)

26298

In [77]:
len(orig_train_df)

26298

In [78]:
from sklearn.metrics import mean_squared_error

In [79]:
import math

In [80]:
def residual_trainer(estimator, model_kwargs=None, forecast_models=forecast_models,
                     tv_df=tv_df, test_df=test_df,
                     tv_forecast_preds=tv_preds, test_forecast_preds=test_forecast_preds,
                     n_splits=4):
    """Fit a residual model on top of each forecaster using year-grouped K-fold CV.

    For every name in `forecast_models`, the residual `num_sold - forecast` is
    computed on the train/validation rows and a fresh `estimator(**model_kwargs)`
    is fit per fold to predict it. Folds come from `GroupKFold`, grouped by the
    calendar year of `date`, so a year never appears in both the fitting and the
    held-out part of a fold.

    Parameters
    ----------
    estimator : callable
        Class/factory returning an object with `.fit(X, y)` and `.predict(X)`.
    model_kwargs : dict, optional
        Keyword arguments for `estimator`; `None` means no arguments.
    forecast_models : iterable of str
        Keys into `tv_forecast_preds` naming the forecasters to correct.
    tv_df, test_df : pandas.DataFrame
        Feature frames; both must contain 'date', 'target' and 'num_sold'.
        They are copied before encoding, so the callers' frames are untouched.
    tv_forecast_preds : mapping / DataFrame
        Forecast predictions on the train/validation rows, one entry per forecaster.
    test_forecast_preds
        Accepted for interface symmetry; not used by this function.
    n_splits : int, default 4
        Number of GroupKFold folds (must not exceed the number of distinct years).

    Returns
    -------
    (residual_tv_df, test_fold_preds)
        `residual_tv_df` holds 'date', 'num_sold' and one
        '<forecaster>_oof_residual_preds' column per forecaster.
        `test_fold_preds` maps '<forecaster>_<fold>_residual_preds' to the
        residual predictions on the test rows made by that fold's model.

    Notes
    -----
    'date' and 'num_sold' for the result frame are read from the notebook-level
    `orig_train_df`, so `tv_df` is expected to be row-aligned with it.
    """
    # a `None` default avoids sharing one mutable dict between calls
    model_kwargs = {} if model_kwargs is None else model_kwargs

    # apply label encoding (which Scikit-Learn models require, but *Prophets don't)
    # on copies, leaving the broader scope's frames alone
    # NOTE(review): train and test are encoded by two separate label_encoder calls;
    # confirm that this yields identical codes for the same category in both frames.
    _, tv_df = label_encoder(tv_df.copy())
    _, test_df = label_encoder(test_df.copy())

    residual_tv_df = pd.DataFrame({
        'date': orig_train_df['date'],
        'num_sold': orig_train_df['num_sold']
    })
    print(len(residual_tv_df))

    # getting rid of unneeded 'target' feature, also num_sold since we're only interested in predicting the residual
    # following @ambrosm lightgbm nb: 'date' stays for now because GroupKFold groups on its year
    tv_df = tv_df.drop(columns=['target', 'num_sold'])
    test_df = test_df.drop(columns=['target', 'num_sold'])

    # none of these depend on the forecaster or the fold, so build them once
    year_groups = tv_df['date'].dt.year
    tv_features = tv_df.drop(columns=['date'])
    X_test = test_df.drop(columns=['date'])

    kfolds = GroupKFold(n_splits=n_splits)

    test_fold_preds = {}

    for forecast_model in forecast_models:
        print(f"Working with forecasts from {forecast_model}...")
        print("-----------------------------------------------------")
        forecast = tv_forecast_preds[forecast_model] # predictions on the t-v rows for this forecaster
        residuals = residual_tv_df['num_sold'] - forecast # what the residual model has to learn

        # float buffer: the predictions written into it are floats
        oof_preds = pd.Series(0.0, index=tv_df.index)

        for fold, (train_idx, val_idx) in enumerate(kfolds.split(tv_df, groups=year_groups)):
            print("-----------------------------------------------------")
            print(f"FOLD {fold}")
            X = tv_features.iloc[train_idx]
            y = residuals.iloc[train_idx]
            X_valid = tv_features.iloc[val_idx]
            y_valid = residuals.iloc[val_idx]

            model = estimator(**model_kwargs)
            model.fit(X, y)

            residual_valid_preds = model.predict(X_valid)
            residual_test_preds = model.predict(X_test)

            # val_idx is positional (it comes from .split), so write by position, not by label
            oof_preds.iloc[val_idx] = residual_valid_preds
            rmse = math.sqrt(mean_squared_error(y_pred=residual_valid_preds, y_true=y_valid.values))
            print(f"RMSE: {rmse}")
            test_fold_preds[f'{forecast_model}_{fold}_residual_preds'] = residual_test_preds
        residual_tv_df[f'{forecast_model}_oof_residual_preds'] = oof_preds
    return residual_tv_df, test_fold_preds
        
            
            
#             model, smape = fit_model(X_tr, X_va, run=0, fold=fold)

# print(f"Average SMAPE: {sum(score_list) / len(score_list):.5f}")
# with open('oof.pickle', 'wb') as handle: pickle.dump(oof, handle)
#         X = tv_df.drop(columns=['residual'])
#         y = tv_df['residual']
        
#         tv_
    
    
    
        
#     y_valid = valid_df['num_sold']
#     valid_df = valid_df.drop(columns=['target'])

In [81]:
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor

In [93]:
# XGBoost residual model: GPU histogram trees grown leaf-wise.
xgboost_params = {
    # hardware
    'tree_method': 'gpu_hist',
    'predictor': 'gpu_predictor',
    # objective / reporting
    'objective': 'reg:squarederror',
    'eval_metric': ['mae', 'mape', 'rmse'],
    # tree shape: max_depth=0 together with lossguide leaves max_leaves as the size limit
    'grow_policy': 'lossguide',
    'max_depth': 0,
    'max_leaves': 255,
    # boosting schedule and regularisation
    'n_estimators': 3000,
    'learning_rate': .09,
    'lambda': 100,
    # row sampling
    'subsample': .15,
    'sampling_method': 'gradient_based',
    'seed': 42,
}


# LightGBM residual model, CPU build.
# NOTE(review): LightGBM documents that `subsample` (bagging_fraction) only takes effect when
# `subsample_freq` (bagging_freq) is > 0; no frequency is set here, so row subsampling is
# presumably inactive -- confirm whether that is intended.
lightgbm_params = dict(
    objective='mse',
    random_state=42,
    device_type='cpu',
    n_jobs=-1,
    max_depth=0,
    learning_rate=0.1,
    subsample=.15,
    n_estimators=1500,
)

# CatBoost residual model: library defaults apart from GPU training, quiet logging and a fixed seed.
catboost_params = dict(
    task_type='GPU',
    silent=True,
    random_state=42,
)
                

XGBoost per-fold residual RMSEs with `n_estimators=500`:

```
26298
Working with forecasts from prophet...
-----------------------------------------------------
-----------------------------------------------------
FOLD 0
RMSE: 44.22169948290956
-----------------------------------------------------
FOLD 1
RMSE: 42.2884194618039
-----------------------------------------------------
FOLD 2
RMSE: 38.87851134388903
-----------------------------------------------------
FOLD 3
RMSE: 39.14401892347087
Working with forecasts from neuralprophet...
-----------------------------------------------------
-----------------------------------------------------
FOLD 0
RMSE: 71.59711327241503
-----------------------------------------------------
FOLD 1
RMSE: 50.410812085025206
-----------------------------------------------------
FOLD 2
RMSE: 56.30623610755547
-----------------------------------------------------
FOLD 3
RMSE: 71.87002649034811
Working with forecasts from ridge...
-----------------------------------------------------
-----------------------------------------------------
FOLD 0
RMSE: 29.951196797722172
-----------------------------------------------------
FOLD 1
RMSE: 33.87710734178301
-----------------------------------------------------
FOLD 2
RMSE: 30.3425528497887
-----------------------------------------------------
FOLD 3
RMSE: 30.908377053367243
Working with forecasts from linear...
-----------------------------------------------------
-----------------------------------------------------
FOLD 0
RMSE: 25.909733767351145
-----------------------------------------------------
FOLD 1
RMSE: 32.55539418287357
-----------------------------------------------------
FOLD 2
RMSE: 28.02573800531867
-----------------------------------------------------
FOLD 3
RMSE: 27.512157750399876
Working with forecasts from huber...
-----------------------------------------------------
-----------------------------------------------------
FOLD 0
RMSE: 43.8980188030959
-----------------------------------------------------
FOLD 1
RMSE: 55.40375581530791
-----------------------------------------------------
FOLD 2
RMSE: 46.37734856211295
-----------------------------------------------------
FOLD 3
RMSE: 41.743947334147386
Working with forecasts from lasso...
-----------------------------------------------------
-----------------------------------------------------
FOLD 0
RMSE: 42.212939444847706
-----------------------------------------------------
FOLD 1
RMSE: 58.6828784233263
-----------------------------------------------------
FOLD 2
RMSE: 47.91103516420516
-----------------------------------------------------
FOLD 3
RMSE: 41.89282012115221
```

In [120]:
# The 3000-estimator XGBoost run is kept below for reference but left disabled:
# xgb_tv_df, xgb_test_fold_preds = residual_trainer(estimator=XGBRegressor, model_kwargs=xgboost_params,)
# dump(xgb_tv_df, predpath/'20220124_residual_oof_residual_xgboost-3000_preds.joblib')
# dump(xgb_test_fold_preds, predpath/'20220124_residual_test_residual_xgboost-3000_preds.joblib')
#
# Instead, reload the earlier 500-estimator predictions, which were universally better.
# NOTE(review): later cells read `xgb_test_fold_preds`, which only the disabled call above
# defines; on a fresh kernel that name would be missing -- confirm whether `xgb_test_df`
# is the object those cells are meant to use.
xgb_tv_df, xgb_test_df = (
    load(predpath/artifact_name)
    for artifact_name in (
        '20220124_residual_oof_xgboost_preds.joblib',
        '20220124_residual_test_xgboost_preds.joblib',
    )
)

['/media/sf/easystore/kaggle_data/tabular_playgrounds/jan2022/preds/20220124_residual_test_residual_xgboost-3000_preds.joblib']

In [90]:
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor

In [94]:
# Fit LightGBM on every forecaster's residuals, then persist both outputs:
# the out-of-fold frame and the dict of per-fold test residual predictions.
lgb_residual_outputs = residual_trainer(
    model_kwargs=lightgbm_params,
    estimator=LGBMRegressor,
)
lgb_tv_df, lgb_test_fold_preds = lgb_residual_outputs
dump(lgb_tv_df, predpath/'20220124_residual_oof_residual_lightgbm_preds.joblib')
dump(lgb_test_fold_preds, predpath/'20220124_residual_test_residual_lightgbm_preds.joblib')

26298
Working with forecasts from prophet...
-----------------------------------------------------
-----------------------------------------------------
FOLD 0
RMSE: 40.2958973133468
-----------------------------------------------------
FOLD 1
RMSE: 40.72754418203894
-----------------------------------------------------
FOLD 2
RMSE: 34.289662190196665
-----------------------------------------------------
FOLD 3
RMSE: 35.118251779330976
Working with forecasts from neuralprophet...
-----------------------------------------------------
-----------------------------------------------------
FOLD 0
RMSE: 71.44951581377815
-----------------------------------------------------
FOLD 1
RMSE: 47.489823189781696
-----------------------------------------------------
FOLD 2
RMSE: 53.766625686957966
-----------------------------------------------------
FOLD 3
RMSE: 58.62494371250313
Working with forecasts from ridge...
-----------------------------------------------------
----------------------------

['/media/sf/easystore/kaggle_data/tabular_playgrounds/jan2022/preds/20220124_residual_test_residual_lightgbm_preds.joblib']

In [92]:
# Fit CatBoost on every forecaster's residuals, then persist both outputs:
# the out-of-fold frame and the dict of per-fold test residual predictions.
cat_residual_outputs = residual_trainer(
    model_kwargs=catboost_params,
    estimator=CatBoostRegressor,
)
cat_tv_df, cat_test_fold_preds = cat_residual_outputs
dump(cat_tv_df, predpath/'20220124_residual_oof_residual_catboost_preds.joblib')
dump(cat_test_fold_preds, predpath/'20220124_residual_test_residual_catboost_preds.joblib')

26298
Working with forecasts from prophet...
-----------------------------------------------------
-----------------------------------------------------
FOLD 0
RMSE: 34.85084358418546
-----------------------------------------------------
FOLD 1
RMSE: 39.134690709805604
-----------------------------------------------------
FOLD 2
RMSE: 32.31754105801728
-----------------------------------------------------
FOLD 3
RMSE: 32.297408734705535
Working with forecasts from neuralprophet...
-----------------------------------------------------
-----------------------------------------------------
FOLD 0
RMSE: 64.4820465743269
-----------------------------------------------------
FOLD 1
RMSE: 45.80180130203257
-----------------------------------------------------
FOLD 2
RMSE: 47.036335391797884
-----------------------------------------------------
FOLD 3
RMSE: 54.09250227778184
Working with forecasts from ridge...
-----------------------------------------------------
-----------------------------

['/media/sf/easystore/kaggle_data/tabular_playgrounds/jan2022/preds/20220124_residual_test_residual_catboost_preds.joblib']

In [None]:
# dump(valid_forecast_preds, predpath/'20220124_forecast_valid2018_preds.joblib')
# dump(test_forecast_preds, predpath/'20220124_forecast_test_preds.joblib')

#### Assembling together residual predictions

In [125]:
# The three GBM out-of-fold frames, keyed by a short architecture tag.
tv_dfs = dict(xgb=xgb_tv_df, lgb=lgb_tv_df, cat=cat_tv_df)

# Extend each frame (in place) with every forecaster's raw prediction and its true residual.
for arch, arch_df in tv_dfs.items():
    for forecast_model in forecast_models:
        forecast = tv_preds[forecast_model]
        arch_df[f'{forecast_model}_pred'] = forecast
        arch_df[f'{forecast_model}_residual'] = arch_df['num_sold'] - forecast

In [126]:
xgb_tv_df

Unnamed: 0,date,num_sold,prophet_oof_preds,neuralprophet_oof_preds,ridge_oof_preds,linear_oof_preds,huber_oof_preds,lasso_oof_preds,prophet_pred,prophet_residual,neuralprophet_pred,neuralprophet_residual,ridge_pred,ridge_residual,linear_pred,linear_residual,huber_pred,huber_residual,lasso_pred,lasso_residual
0,2015-01-01,329,1.278618,22.734301,34.222641,3.645102,139.376175,166.224564,346.560416,-17.560416,329.134521,-0.134521,283.272258,45.727742,324.188332,4.811668,239.868861,89.131139,176.381725,152.618275
1,2015-01-01,520,-10.982243,42.120220,57.832901,-9.185784,192.085785,217.624084,536.203586,-16.203586,458.008301,61.991699,439.771734,80.228266,503.081569,16.918431,334.618687,185.381313,334.453055,185.546945
2,2015-01-01,146,-5.641450,1.866524,11.319877,-6.932339,66.705956,53.481075,143.412803,2.587197,145.325577,0.674423,120.885683,25.114317,136.667932,9.332068,96.086063,49.913937,89.802390,56.197610
3,2015-01-01,572,7.359212,19.275978,65.995567,11.750156,285.283234,283.822510,590.117165,-18.117165,552.571167,19.428833,488.810710,83.189290,555.531957,16.468043,330.014790,241.985210,307.788806,264.211194
4,2015-01-01,911,0.859362,89.720268,132.802261,-6.488067,122.903603,407.049622,939.673009,-28.673009,781.967285,129.032715,771.701702,139.298298,890.469201,20.530799,875.922092,35.077908,583.510103,327.489897
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26293,2018-12-31,823,-3.253311,280.156372,132.241791,-10.704310,465.822357,427.108459,898.322121,-75.322121,669.418457,153.581543,723.714830,99.285170,846.597603,-23.597603,407.206130,415.793870,397.285794,425.714206
26294,2018-12-31,250,-14.439471,11.863194,46.179234,5.300345,118.407372,119.623322,253.512355,-3.512355,227.158142,22.841858,205.880079,44.119921,241.048951,8.951049,132.620712,117.379288,126.765608,123.234392
26295,2018-12-31,1004,-31.400349,337.634491,146.736877,-6.179089,552.454895,531.086609,1039.635205,-35.635205,715.639648,288.360352,832.192362,171.807638,975.785339,28.214661,401.224848,602.775152,430.368706,573.631294
26296,2018-12-31,1441,34.119045,617.672424,227.119736,-13.130022,642.822388,774.511536,1526.908216,-85.908216,980.234009,460.765991,1255.885410,185.114590,1468.776593,-27.776593,863.230649,577.769351,689.775798,751.224202


In [129]:
# Compare each forecaster's SMAPE before and after adding the GBM's out-of-fold residual prediction.
for arch, arch_df in tv_dfs.items(): # xgb, then lgb, then cat
    print(f"\nFor {arch}...\n")
    actual = arch_df['num_sold']
    for forecast_model in forecast_models:
        # the reloaded XGBoost frame names its OOF residual column differently from the other two
        oof_col = f'{forecast_model}_oof_preds' if arch == 'xgb' else f'{forecast_model}_oof_residual_preds'
        forecast = arch_df[f'{forecast_model}_pred']

        adjusted_smape = SMAPE(y_pred=forecast+arch_df[oof_col], y_true=actual)
        original_smape = SMAPE(y_pred=forecast, y_true=actual)
        print(f'Before residual adjustment, SMAPE for {forecast_model} was {original_smape}. Final SMAPE for {forecast_model} is {adjusted_smape}.')


For xgb...

Before residual adjustment, SMAPE for prophet was 7.098824805150043. Final SMAPE for prophet is 6.459337760847558.
Before residual adjustment, SMAPE for neuralprophet was 7.926662675729486. Final SMAPE for neuralprophet is 9.025599197404372.
Before residual adjustment, SMAPE for ridge was 4.781041939812455. Final SMAPE for ridge is 5.4770486671076055.
Before residual adjustment, SMAPE for linear was 4.175573461450916. Final SMAPE for linear is 5.141834427310028.
Before residual adjustment, SMAPE for huber was 14.396222549610735. Final SMAPE for huber is 7.442538563573811.
Before residual adjustment, SMAPE for lasso was 15.11713989060885. Final SMAPE for lasso is 7.000306029463663.

For lgb...

Before residual adjustment, SMAPE for prophet was 7.098824805150043. Final SMAPE for prophet is 5.977672631338595.
Before residual adjustment, SMAPE for neuralprophet was 7.926662675729486. Final SMAPE for neuralprophet is 8.104872610910304.
Before residual adjustment, SMAPE for ridg

With + as operator:
```
Before residual adjustment, SMAPE for prophet was 7.098824805150043. Final SMAPE for prophet is 14.394425598661853.
Before residual adjustment, SMAPE for neuralprophet was 7.930322204823288. Final SMAPE for neuralprophet is 17.10022501507438.
Before residual adjustment, SMAPE for ridge was 4.781041939812455. Final SMAPE for ridge is 9.6272249020869.
Before residual adjustment, SMAPE for linear was 4.175573461450916. Final SMAPE for linear is 8.315020784789288.
Before residual adjustment, SMAPE for huber was 14.396222549610735. Final SMAPE for huber is 30.438323388261274.
Before residual adjustment, SMAPE for lasso was 15.11713989060885. Final SMAPE for lasso is 31.73839575442759.
```

In [130]:
# One DataFrame per GBM: a column for every '<forecaster>_<fold>_residual_preds' entry.
test_dfs_dict = {
    'xgb': pd.DataFrame(xgb_test_fold_preds),
    'lgb': pd.DataFrame(lgb_test_fold_preds),
    'cat': pd.DataFrame(cat_test_fold_preds),
}

# Keep the individual names too; later cells display them directly.
xgb_test_preds_df, lgb_test_preds_df, cat_test_preds_df = test_dfs_dict.values()

In [131]:
xgb_test_preds_df

Unnamed: 0,prophet_0_residual_preds,prophet_1_residual_preds,prophet_2_residual_preds,prophet_3_residual_preds,neuralprophet_0_residual_preds,neuralprophet_1_residual_preds,neuralprophet_2_residual_preds,neuralprophet_3_residual_preds,ridge_0_residual_preds,ridge_1_residual_preds,...,linear_2_residual_preds,linear_3_residual_preds,huber_0_residual_preds,huber_1_residual_preds,huber_2_residual_preds,huber_3_residual_preds,lasso_0_residual_preds,lasso_1_residual_preds,lasso_2_residual_preds,lasso_3_residual_preds
0,-4.102104,-14.913374,-3.080225,3.018413,-13.194138,-9.770862,4.078837,10.081055,44.700542,45.580715,...,2.724960,-0.174390,121.098564,104.324783,115.000725,134.269119,166.264984,180.267502,164.097778,183.542419
1,4.723720,-27.195826,-26.876904,-8.286407,43.865379,38.422222,35.413857,53.802963,83.547729,69.001053,...,-4.532751,-5.094652,244.659119,196.919464,191.129303,232.659348,231.795670,224.633469,217.380402,241.567947
2,3.392705,-9.823316,-2.409318,5.491077,-11.074921,-17.552523,-8.449777,-4.353706,17.637873,21.686581,...,0.788980,6.874635,58.146481,53.230881,53.513405,65.918106,56.291412,60.583702,51.450211,58.802227
3,19.689138,-27.230070,-2.332937,20.125134,18.200836,-6.246705,10.685670,24.898489,91.927765,68.484474,...,14.829673,7.144485,311.678619,252.858536,284.634705,316.725922,317.367706,315.465668,277.974579,321.037750
4,-29.621655,-35.599377,-15.653322,-16.833036,78.906929,91.135094,97.449638,72.392570,116.650841,133.524628,...,4.686835,-33.556316,91.731354,67.475739,67.338272,92.719971,407.754425,425.642609,395.712799,419.240723
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6565,-118.545227,-45.659576,-60.096733,-2.052044,103.060165,183.951218,164.585052,279.236237,88.467285,128.922409,...,-10.705691,1.563066,325.046143,387.724213,392.680786,421.922607,322.336914,378.124969,414.231171,438.822266
6566,-21.786833,-1.243683,-7.646570,-13.557628,-13.458005,-1.849963,6.081090,33.629280,38.515495,42.727047,...,2.791987,8.528316,89.253609,98.561577,118.342407,113.643692,112.083138,115.539803,125.085037,119.928062
6567,-104.923912,-63.657272,-59.335228,25.098553,189.545441,254.937271,248.879913,362.050140,137.843811,138.724991,...,3.405739,14.622226,486.076782,535.784851,546.256592,604.121521,477.918091,507.334686,534.547546,548.917542
6568,-120.393776,-65.994438,-93.683273,0.634076,415.604767,484.141022,420.275391,567.493896,199.480713,206.807465,...,-29.887718,-34.617683,475.030273,496.340881,496.983246,543.476074,625.415466,679.819031,680.618042,723.934326


In [132]:
lgb_test_preds_df

Unnamed: 0,prophet_0_residual_preds,prophet_1_residual_preds,prophet_2_residual_preds,prophet_3_residual_preds,neuralprophet_0_residual_preds,neuralprophet_1_residual_preds,neuralprophet_2_residual_preds,neuralprophet_3_residual_preds,ridge_0_residual_preds,ridge_1_residual_preds,...,linear_2_residual_preds,linear_3_residual_preds,huber_0_residual_preds,huber_1_residual_preds,huber_2_residual_preds,huber_3_residual_preds,lasso_0_residual_preds,lasso_1_residual_preds,lasso_2_residual_preds,lasso_3_residual_preds
0,30.659798,-5.406039,25.685431,25.702927,38.625799,-26.543532,42.386490,41.566642,62.010359,38.643447,...,-7.667029,-8.715803,169.639647,122.723280,156.201496,165.882975,203.273303,166.865059,204.484263,206.633983
1,23.326129,-14.781889,7.569961,14.815576,74.046034,-2.021053,72.035079,59.873683,86.575967,59.412709,...,-21.454504,-15.185340,255.575499,162.094788,245.791512,271.419730,274.603292,189.301795,253.664893,277.652861
2,14.903635,9.096575,19.581296,16.245287,17.059769,-4.708975,18.002370,15.954930,28.641066,15.127044,...,-0.398582,-0.326677,86.313253,71.875810,82.106209,87.898212,75.172092,61.772996,77.285651,71.062885
3,32.888279,-13.094183,39.609446,44.406229,39.773920,-34.176598,61.832559,54.490520,99.549027,55.405590,...,-16.596746,-22.202567,336.296909,271.771754,340.399942,346.054952,321.728073,255.686679,326.044949,327.006351
4,-3.216878,-10.839813,27.319067,10.627042,125.004497,63.631370,149.099507,117.800786,124.353946,114.335094,...,-36.342601,-57.980621,147.501199,63.622507,148.752113,144.124544,445.731385,345.544891,455.457338,435.547905
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6565,-68.673045,-56.764350,-62.810567,1.022159,180.027155,255.176557,147.445339,248.247112,120.124596,137.676273,...,3.876652,1.078910,364.930049,384.275880,379.192802,431.717952,376.910966,384.656657,387.463713,413.815027
6566,-5.854447,-6.348483,-13.550638,7.160769,11.840255,27.455167,18.720853,14.228303,46.428923,49.226229,...,16.116670,11.648676,120.257638,118.102620,112.660298,127.334350,129.797733,129.195569,124.898281,131.851480
6567,-42.575155,-77.468770,-69.646457,14.660957,298.763778,315.009559,218.779005,354.199629,169.165384,156.205672,...,17.859898,22.850640,535.992099,557.989588,526.079526,604.837679,508.729383,505.244678,511.136612,537.605524
6568,-74.562045,-34.338660,-65.610515,36.533271,508.040128,569.731072,394.333840,553.090571,220.473254,231.219097,...,-9.653883,-24.673348,527.189767,523.864992,502.212720,565.484246,672.356991,709.551573,644.943501,708.009935


## Submission of Naive Forecasts

Something's up with the residuals; for now, let's just run a simple model on the original forecasts.

In [100]:
# model = Lasso()

In [101]:
# tv_preds

In [102]:
# X = tv_preds.drop(columns=['num_sold'])
# y = tv_preds['num_sold']

In [103]:
# X_test = test_forecast_preds

In [104]:
# X['date'] = pd.to_datetime(X.date)

In [105]:
# X['date'] = X['date'].map(dt.datetime.toordinal)
# model.fit(X,y)

In [106]:
# # X_test['date'] = X['date'].map(dt.datetime.toordinal)
# X_test['date'] = pd.to_datetime(X_test.date)
# X_test['date'] = X_test['date'].map(dt.datetime.toordinal)

In [107]:
# lasso_test_preds = model.predict(X_test)

In [108]:
# lasso_tv_preds = model.predict(X)

In [109]:
# SMAPE(y_pred=lasso_tv_preds, y_true=y)

In [110]:
# sample_df = pd.read_csv(datapath/'sample_submission.csv')

In [111]:
# sample_df.loc[:, 'num_sold'] = lasso_test_preds

In [112]:
# sample_df.head()

In [113]:
# sample_df.to_csv(subpath/f"20220124_forecasts_lasso_preds.csv", index=False)


This got LB 4.40223

## Submission of Adjusted XGB Naivish

In [135]:
# Final out-of-fold predictions: forecaster prediction + GBM OOF residual prediction,
# one "<arch>+<forecaster>" column per combination.
adjusted_oof_preds = pd.DataFrame({'date': orig_train_df['date']})

tv_dfs_dict = dict(xgb=xgb_tv_df, lgb=lgb_tv_df, cat=cat_tv_df)

for arch, arch_df in tv_dfs_dict.items():
    print(f"{arch}\n")
    for forecast_model in forecast_models:
        # the reloaded XGBoost frame names its OOF residual column differently from the other two
        oof_col = f'{forecast_model}_oof_preds' if arch == 'xgb' else f'{forecast_model}_oof_residual_preds'
        adjusted_oof_preds[f"{arch}+{forecast_model}"] = arch_df[f'{forecast_model}_pred']+arch_df[oof_col]

xgb

lgb

cat



In [136]:
adjusted_oof_preds

Unnamed: 0,date,xgb+prophet,xgb+neuralprophet,xgb+ridge,xgb+linear,xgb+huber,xgb+lasso,lgb+prophet,lgb+neuralprophet,lgb+ridge,lgb+linear,lgb+huber,lgb+lasso,cat+prophet,cat+neuralprophet,cat+ridge,cat+linear,cat+huber,cat+lasso
0,2015-01-01,347.839034,351.868822,317.494899,327.833433,379.245036,342.606288,335.467823,332.675307,327.001013,321.609870,354.488671,333.410991,333.915969,327.030192,314.826055,317.206583,346.279165,336.233347
1,2015-01-01,525.221343,500.128521,497.604635,493.895785,526.704472,552.077139,519.648545,473.685888,501.364338,499.126331,492.573564,541.304068,509.140981,494.025583,506.135450,495.753502,495.232955,522.987775
2,2015-01-01,137.771353,147.192100,132.205560,129.735593,162.792018,143.283465,148.667331,138.844205,137.737256,131.750294,154.851398,136.278547,142.338521,123.486544,128.473396,133.452584,145.584398,133.919333
3,2015-01-01,597.476377,571.847145,554.806277,567.282113,615.298024,591.611316,590.189094,556.542029,559.641560,565.843105,593.972827,571.047420,583.211954,622.898234,569.523275,547.732239,615.549186,636.556795
4,2015-01-01,940.532371,871.687553,904.503964,883.981135,998.825694,990.559725,921.358231,850.556192,888.371783,899.732611,920.105059,926.419508,935.543658,943.701398,898.809221,881.201439,964.717169,1008.016399
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26293,2018-12-31,895.068810,949.574829,855.956621,835.893293,873.028487,824.394254,841.557771,924.595014,861.391103,843.959215,791.482009,781.942451,927.389893,968.623145,854.088327,848.968382,811.147037,832.155634
26294,2018-12-31,239.072883,239.021337,252.059312,246.349297,251.028084,246.388929,247.163872,254.613309,255.106308,240.583134,250.723332,255.961176,216.592747,265.774641,250.231972,244.564056,223.426179,236.455523
26295,2018-12-31,1008.234856,1053.274139,978.929240,969.606250,953.679743,961.455315,962.166435,1030.649207,988.398034,968.586942,959.214436,935.613384,1075.349456,1073.352314,971.048282,969.551301,900.275331,944.275194
26296,2018-12-31,1561.027261,1597.906433,1483.005146,1455.646571,1506.053037,1464.287333,1492.569556,1549.965081,1487.104507,1453.843170,1387.095641,1399.327370,1663.235599,1589.577839,1468.298331,1468.509798,1504.615944,1418.220415


In [137]:
# Attach the true sales next to the adjusted out-of-fold predictions.
adjusted_oof_preds['num_sold'] = orig_train_df['num_sold']

In [138]:
# Lasso with default hyperparameters; it is fit on the adjusted OOF predictions further down.
model = Lasso()

In [139]:
# Features: 'date' plus every "<arch>+<forecaster>" adjusted OOF column; target: actual sales.
X = adjusted_oof_preds.drop(columns=['num_sold'])
y = adjusted_oof_preds['num_sold']

In [141]:
# Average the four per-fold test residual predictions for every (GBM, forecaster) pair.
# Despite the name, this frame holds residual predictions only; the forecasts are added later.
adjusted_test_preds = pd.DataFrame({'date': orig_test_df['date']})

for arch in ('xgb', 'lgb', 'cat'): #prophet_0_residual_preds
    fold_preds = test_dfs_dict[arch]
    for forecast_model in forecast_models:
        fold_total = sum(fold_preds[f'{forecast_model}_{i}_residual_preds'] for i in range(4))
        adjusted_test_preds[f'{arch}+{forecast_model}'] = fold_total / 4
adjusted_test_preds

Unnamed: 0,date,xgb+prophet,xgb+neuralprophet,xgb+ridge,xgb+linear,xgb+huber,xgb+lasso,lgb+prophet,lgb+neuralprophet,lgb+ridge,lgb+linear,lgb+huber,lgb+lasso,cat+prophet,cat+neuralprophet,cat+ridge,cat+linear,cat+huber,cat+lasso
0,2019-01-01,-4.769322,-2.201277,46.598244,-1.954294,118.673294,173.543167,19.160529,24.008850,56.885965,-8.879925,153.611849,195.314152,-14.693149,-2.291287,39.869848,-14.322520,132.503743,170.387482
1,2019-01-01,-14.408854,42.876106,72.012451,-0.495677,216.341812,228.844376,7.732444,50.983436,74.323495,-13.224976,233.720382,248.805710,-25.254263,41.627735,75.834353,-11.974044,202.962403,226.909551
2,2019-01-01,-0.837213,-10.357732,19.717422,1.525960,57.702221,56.781887,14.956698,11.577024,25.427122,-2.297773,82.048371,71.323406,-1.713054,-9.950343,16.343637,-6.888167,68.528241,53.385513
3,2019-01-01,2.562816,11.884573,82.580994,4.293141,291.474457,307.961426,25.952443,30.480100,87.610350,-18.178226,323.630889,307.616513,-10.082857,61.018263,81.140506,-19.407028,301.357366,333.277065
4,2019-01-01,-24.426847,84.971062,125.501450,-15.061462,79.816330,412.087646,5.972354,113.884040,123.408291,-39.425201,126.000091,420.570380,-8.440692,170.794424,133.047662,-19.296734,108.357982,440.198751
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6565,2019-12-31,-56.588398,182.708160,115.503525,-8.565734,381.843445,388.378845,-46.806451,207.724041,128.891600,-2.203235,390.029170,390.711591,-14.039291,258.248310,119.144230,-2.358119,393.269959,416.836879
6566,2019-12-31,-11.058679,6.100600,39.134518,7.037920,104.950317,118.159012,-4.648200,18.061144,47.770945,9.504077,119.588726,128.935765,-39.302952,7.053310,38.123140,4.600495,88.419512,106.974697
6567,2019-12-31,-50.704464,263.853210,147.788803,3.021900,543.059937,517.179443,-43.757356,296.687993,161.704710,12.331346,556.224723,515.679049,19.582368,331.517462,138.763121,3.077579,493.934774,510.334507
6568,2019-12-31,-69.859352,471.878784,197.818512,-16.968071,502.957611,677.446716,-34.494487,506.298903,215.059298,-15.512250,529.687931,683.715500,91.116775,577.465276,199.991247,-8.266427,631.048378,713.978299


In [142]:
test_forecast_preds

Unnamed: 0,date,prophet,neuralprophet,ridge,linear,huber,lasso
0,2019-01-01,402.296576,383.543457,359.817081,392.328390,248.728792,221.118411
1,2019-01-01,631.642099,551.044983,560.783324,620.092467,347.337935,347.623611
2,2019-01-01,169.992094,173.065674,153.552940,166.788184,99.483226,112.590591
3,2019-01-01,684.559821,648.783691,622.237644,675.674104,342.549613,384.790895
4,2019-01-01,1092.875362,965.683716,969.065554,1084.980801,911.102831,601.189400
...,...,...,...,...,...,...,...
6565,2019-12-31,917.085861,674.891174,742.331990,797.964290,404.876685,391.134492
6566,2019-12-31,261.569590,232.082169,210.920805,225.576610,131.774790,129.930008
6567,2019-12-31,1065.400121,730.457275,852.091174,925.788409,398.926697,440.077496
6568,2019-12-31,1568.407214,1005.392944,1286.814471,1397.071983,858.595973,677.887045


In [143]:
# In this notebook's run order, adjusted_test_preds still holds only the fold-averaged residual predictions here.
dump(adjusted_test_preds, predpath/'20220125_residual_test_predictions_averaged_by_fold_using_Big3-GBMS.joblib')

['/media/sf/easystore/kaggle_data/tabular_playgrounds/jan2022/preds/20220125_residual_test_predictions_averaged_by_fold_using_Big3-GBMS.joblib']

In [144]:
# Persist the forecast + GBM-residual out-of-fold predictions (the frame also carries 'date' and 'num_sold').
dump(adjusted_oof_preds, predpath/'20220125_final_oof_predictions_forecast+Big3GBM-residuals.joblib')

['/media/sf/easystore/kaggle_data/tabular_playgrounds/jan2022/preds/20220125_final_oof_predictions_forecast+Big3GBM-residuals.joblib']

In [145]:
adjusted_test_residual_preds = adjusted_test_preds.copy() # fixing the confusing name

In [147]:
# Final test predictions: each forecaster's test forecast plus the matching averaged GBM residual prediction.
adjusted_test_preds = pd.DataFrame({'date': orig_test_df['date']})

for arch in ('xgb', 'lgb', 'cat'):
    for forecast_model in forecast_models:
        combo = f'{arch}+{forecast_model}'
        base_forecast = test_forecast_preds[f'{forecast_model}']
        adjusted_test_preds[combo] = base_forecast + adjusted_test_residual_preds[combo]

In [148]:
adjusted_test_preds

Unnamed: 0,date,xgb+prophet,xgb+neuralprophet,xgb+ridge,xgb+linear,xgb+huber,xgb+lasso,lgb+prophet,lgb+neuralprophet,lgb+ridge,lgb+linear,lgb+huber,lgb+lasso,cat+prophet,cat+neuralprophet,cat+ridge,cat+linear,cat+huber,cat+lasso
0,2019-01-01,397.527253,381.342180,406.415325,390.374097,367.402086,394.661578,421.457105,407.552307,416.703046,383.448465,402.340641,416.432563,387.603427,381.252170,399.686929,378.005871,381.232535,391.505893
1,2019-01-01,617.233244,593.921089,632.795775,619.596790,563.679747,576.467987,639.374543,602.028419,635.106819,606.867491,581.058317,596.429322,606.387836,592.672717,636.617677,608.118423,550.300338,574.533162
2,2019-01-01,169.154881,162.707942,173.270363,168.314144,157.185447,169.372478,184.948792,184.642697,178.980063,164.490411,181.531597,183.913997,168.279040,163.115331,169.896578,159.900016,168.011467,165.976105
3,2019-01-01,687.122637,660.668264,704.818638,679.967246,634.024070,692.752321,710.512264,679.263792,709.847994,657.495878,666.180502,692.407408,674.476965,709.801954,703.378151,656.267077,643.906978,718.067960
4,2019-01-01,1068.448515,1050.654778,1094.567004,1069.919339,990.919161,1013.277046,1098.847716,1079.567756,1092.473845,1045.555600,1037.102922,1021.759780,1084.434670,1136.478140,1102.113216,1065.684067,1019.460813,1041.388151
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6565,2019-12-31,860.497464,857.599335,857.835514,789.398556,786.720130,779.513337,870.279411,882.615215,871.223590,795.761055,794.905856,781.846082,903.046571,933.139484,861.476220,795.606171,798.146644,807.971371
6566,2019-12-31,250.510911,238.182769,250.055323,232.614530,236.725107,248.089020,256.921390,250.143313,258.691751,235.080687,251.363516,258.865774,222.266638,239.135478,249.043946,230.177105,220.194302,236.904705
6567,2019-12-31,1014.695657,994.310486,999.879978,928.810309,941.986633,957.256940,1021.642765,1027.145268,1013.795884,938.119756,955.151420,955.756545,1084.982489,1061.974737,990.854295,928.865988,892.861471,950.412003
6568,2019-12-31,1498.547862,1477.271729,1484.632983,1380.103912,1361.553584,1355.333761,1533.912727,1511.691847,1501.873769,1381.559732,1388.283904,1361.602545,1659.523989,1582.858220,1486.805718,1388.805555,1489.644351,1391.865343


In [149]:
# Features for the test set. Take a copy rather than an alias so that the in-place date
# encoding applied to X_test further down does not also rewrite `adjusted_test_preds`.
X_test = adjusted_test_preds.copy()

In [150]:
# Parse the date column to datetimes. Skip when the column is already numeric (i.e. this
# cell is re-run after the dates were encoded as integer ordinals): pd.to_datetime would
# read those integers as nanoseconds since the epoch and silently collapse every date to 1970.
if not pd.api.types.is_numeric_dtype(X['date']):
    X['date'] = pd.to_datetime(X['date'])

In [151]:
# Encode dates as proleptic-Gregorian ordinals so the model gets a numeric date feature.
# Only convert while the column is still datetime-typed: on a re-run it already holds
# ints, and datetime.toordinal would raise TypeError before the fit is reached.
if pd.api.types.is_datetime64_any_dtype(X['date']):
    X['date'] = X['date'].map(dt.datetime.toordinal)
model.fit(X,y) # this is the lasso regressor

Lasso()

In [152]:
# Give the test features the same ordinal date encoding as X. The guard keeps the cell
# re-runnable: once the column holds integer ordinals, pd.to_datetime would interpret them
# as epoch nanoseconds and silently turn every date into 1970-01-01.
if not pd.api.types.is_numeric_dtype(X_test['date']):
    X_test['date'] = pd.to_datetime(X_test['date']).map(dt.datetime.toordinal)

In [153]:
# Test-set predictions from the fitted lasso model.
lasso_test_preds = model.predict(X_test)

In [154]:
# In-sample predictions: X is the same data the lasso model was fitted on.
lasso_tv_preds = model.predict(X)

Naive SMAPE was 4.152715109788985

In [155]:
# Score the lasso model with the SMAPE helper on the data it was fitted on
# (in-sample, so this is likely optimistic relative to unseen data).
SMAPE(y_pred=lasso_tv_preds, y_true=y)

4.384919101043075

In [160]:
# Same in-sample comparison as for the lasso, using a default-parameter Ridge instead.
ridge = Ridge().fit(X, y)
ridge_tv_preds = ridge.predict(X)
ridge_test_preds = ridge.predict(X_test)
SMAPE(y_true=y, y_pred=ridge_tv_preds)


4.276633983822924

In [161]:
# Load the sample submission; it is used below as the template for the submission file.
sample_df = pd.read_csv(datapath/'sample_submission.csv')

In [162]:
# Fill the submission with the ridge model's test predictions. Assign the column outright
# instead of writing through .loc[:, ...]: .loc sets values into the existing column, and
# if the template's placeholder column is integer-typed (assumed -- confirm), recent pandas
# warns or raises when floats are written into it.
assert len(ridge_test_preds) == len(sample_df), "prediction count must match submission rows"
sample_df['num_sold'] = ridge_test_preds

In [163]:
# Sanity-check the first rows of the filled-in submission.
sample_df.head()

Unnamed: 0,row_id,num_sold
0,26298,375.711427
1,26299,609.169189
2,26300,156.729623
3,26301,660.152883
4,26302,1073.205078


In [164]:
# Write the submission file, leaving out the DataFrame's row index.
submission_path = subpath/"20220125_forecasts+Big3-GBMs-residuals_ridge_preds.csv"
sample_df.to_csv(submission_path, index=False)

This submission got a leaderboard (LB) score of 4.79516.