In [1]:
# notebook configuration
# if '/sf/' in pwd:
#     COLAB, SAGE = False, False
# elif 'google.colab' in str(get_ipython()):
#     COLAB, SAGE = True, False # do colab-specific installs later
# else:
#     COLAB, SAGE = False, True
    
CONTEXT = 'local' # or 'colab', 'sage', 'kaggle'
USE_GPU = True 
%config Completer.use_jedi = False

In [2]:
# basic imports
from pathlib import Path
import os
import math
from datetime import datetime
import random

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import warnings
warnings.filterwarnings("ignore")

import requests # for telegram notifications
from tqdm.notebook import tqdm

from joblib import dump, load

import datetime as dt

In [3]:
# model selection
from sklearn.model_selection import train_test_split, StratifiedKFold, KFold

# metrics
from sklearn.metrics import accuracy_score, mean_absolute_error, mean_squared_error

# normalization
# from sklearn.preprocessing import RobustScaler, StandardScaler, MinMaxScaler, MaxAbsScaler, RobustScaler, QuantileTransformer
# from gauss_rank_scaler import GaussRankScaler

# feature generation
# import category_encoders as ce

# models
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
# from sklearn.ensemble import StackingClassifier, RandomForestClassifier
import torch
from torch.optim import Adam, AdamW, Adagrad, SGD, RMSprop, LBFGS
from torch.optim.lr_scheduler import ReduceLROnPlateau, CosineAnnealingWarmRestarts, CyclicLR, OneCycleLR, StepLR, CosineAnnealingLR
# from pytorch_widedeep import Trainer
# from pytorch_widedeep.preprocessing import WidePreprocessor, TabPreprocessor
# from pytorch_widedeep.models import Wide, TabMlp, WideDeep, SAINT#, TabTransformer, TabNet, TabFastFormer, TabResnet
# from pytorch_widedeep.metrics import Accuracy
# from pytorch_widedeep.callbacks import EarlyStopping, LRHistory, ModelCheckpoint

# feature reduction
# from sklearn.decomposition import PCA
# from umap import UMAP

# clustering
# from sklearn.cluster import DBSCAN, KMeans
# import hdbscan

# feature selection
# from sklearn.feature_selection import SelectKBest, f_regression, mutual_info_regression
# import featuretools as ft
# from BorutaShap import BorutaShap
# from boruta import BorutaPy

# tracking 
import wandb
# Both integrations export a function called `wandb_callback`; importing them under the
# same name lets the second import silently replace the first. Bind each to its own alias,
# and keep the bare `wandb_callback` name pointing at the LightGBM callback (which is what
# the original import order resolved to) so existing references behave as before.
from wandb.xgboost import wandb_callback as xgb_wandb_callback
from wandb.lightgbm import wandb_callback as lgbm_wandb_callback
wandb_callback = lgbm_wandb_callback
os.environ['WANDB_NOTEBOOK_NAME'] = f"nb_{datetime.now().strftime('%Y%m%d')}.ipynb"

In [4]:
# # time series
# import tsfresh

# import darts
# from darts import TimeSeries
# from darts.models import ExponentialSmoothing, AutoARIMA, ARIMA, Prophet, RandomForest, RegressionEnsembleModel, RegressionModel, TFTModel, TCNModel, TransformerModel, NBEATSModel
import holidays
import dateutil.easter as easter
from prophet import Prophet
from neuralprophet import NeuralProphet

In [5]:
# Pick the project root for the current execution context.
if CONTEXT == 'colab':
    # mount Google Drive
    from google.colab import drive
    drive.mount('/content/drive')

    # handling datapath
    # datapath = Path('/content/drive/MyDrive/kaggle/tabular_playgrounds/dec2021/')
    root = Path('') # TODO

elif CONTEXT == 'sage':
    root = Path('') # TODO

elif CONTEXT == 'kaggle':
    root = Path('') # TODO

else: # if on local machine
    root = Path('/media/sf/easystore/kaggle_data/tabular_playgrounds/jan2022/')

# Derive the working directories from `root` for EVERY context. Previously these names were
# only bound in the local branch, so later cells that read `datapath` / `predpath` raised
# NameError under 'colab', 'sage' or 'kaggle'.
datapath = root/'datasets'
# edapath = root/'EDA'
# modelpath = Path('/media/sf/easystore/kaggle_data/tabular_playgrounds/oct2021/models/')
predpath = root/'preds'
subpath = root/'submissions'
studypath = root/'studies'

# `root` itself must already exist; only the leaf directories are created here.
for pth in [datapath, predpath, subpath, studypath]:
    pth.mkdir(exist_ok=True)

In [6]:
SEED = 42

# Function to seed everything but the models
def seed_everything(seed, pytorch=True, reproducible=True):
    """
    Seed the global random number generators used by the notebook (not the models).

    Parameters
    ----------
    seed : int
        Value used for Python's `random`, NumPy's legacy global RNG, the
        PYTHONHASHSEED environment variable and (optionally) PyTorch.
    pytorch : bool
        When True, also seed torch on CPU and on every visible CUDA device.
    reproducible : bool
        When True (and cuDNN is available), switch cuDNN to deterministic mode
        and turn off its benchmark autotuner.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    random.seed(seed)
    np.random.seed(seed)

    if not pytorch:
        return

    torch.manual_seed(seed)  # CPU generator
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)  # every GPU generator

    cudnn = torch.backends.cudnn
    if reproducible and cudnn.is_available():
        cudnn.deterministic = True
        cudnn.benchmark = False

seed_everything(seed=SEED)

In [7]:
def reduce_memory_usage(df, verbose=True):
    """
    Function to reduce memory usage by downcasting datatypes in a Pandas DataFrame when possible.

    Each column whose dtype is one of the plain NumPy int/float types is cast to the
    narrowest type of the same family whose representable range contains the column's
    min and max (bounds inclusive, so e.g. a column spanning -128..127 becomes int8).
    Other columns are left untouched.

    The frame is modified IN PLACE and also returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink.
    verbose : bool
        When True, print the final memory footprint and the percentage saved.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.

    h/t to Bryan Arnold (https://www.kaggle.com/puremath86/label-correction-experiments-tps-nov-21)
    """

    numerics = ["int8", "int16", "int32", "int64", "float16", "float32", "float64"]
    # Candidates are tried narrowest-first; the first one that fits wins.
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024 ** 2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if str(col_type)[:3] == "int":
            # If min/max are NaN (empty column) no candidate matches and the column is kept as is.
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            # NOTE: only the value RANGE is checked. float16/float32 carry fewer significant
            # digits than float64, so values may be rounded by the cast.
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Out of float32 range, infinite, or all-NaN: fall back to float64.
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024 ** 2
    if verbose:
        print(
            "Mem. usage decreased to {:.2f} Mb ({:.1f}% reduction)".format(
                end_mem, 100 * (start_mem - end_mem) / start_mem
            )
        )
    return df

In [8]:
# Telegram credentials for Galileo (jupyter_watcher_bot). Read them from the environment so
# real secrets never have to be pasted into (and saved with) the notebook; the placeholder
# strings remain the fallback when the variables are not set.
tg_api_token = os.environ.get('TG_API_TOKEN', 'your_api_token')
tg_chat_id = os.environ.get('TG_CHAT_ID', 'your_chat_id')

import requests

def send_tg_message(text='Cell execution completed.', timeout=10):
    """
    Send `text` to the configured Telegram chat via the Bot API `sendMessage` endpoint.

    This is a best-effort notification: a network problem or an error response is
    reported with a short printed message instead of raising, so a failed notification
    cannot abort the (usually long-running) cell that called it.

    Parameters
    ----------
    text : str
        Message body.
    timeout : float
        Seconds to wait for the HTTP request before giving up. Without a timeout
        `requests` may block indefinitely.

    Uses the module-level `tg_api_token` and `tg_chat_id`.

    h/t Ivan Dembicki Jr. for the base version
    (https://medium.com/@ivan.dembicki.jr/notifications-in-jupyter-notebook-with-telegram-f2e892c55173)
    """
    try:
        response = requests.post('https://api.telegram.org/' +  'bot{}/sendMessage'.format(tg_api_token),
                                 params=dict(chat_id=tg_chat_id, text=text),
                                 timeout=timeout)
        response.raise_for_status()
    except requests.RequestException as err:
        # Only the exception type is shown: the full message contains the request URL,
        # which embeds the bot token.
        print(f"Telegram notification failed ({type(err).__name__}).")

In [9]:
def SMAPE(y_true, y_pred):
    '''
    Symmetric mean absolute percentage error, on a 0-200 scale.

    Each element contributes |y_true - y_pred| / ((|y_true| + |y_pred|) / 200); elements
    where both values are zero contribute 0. The mean over all elements is returned.

    Parameters
    ----------
    y_true, y_pred : array-like
        Actual and predicted values (anything `np.asarray` accepts; shapes must broadcast).

    Returns
    -------
    float

    h/t Jean-François Puget (@CPMP) -- see https://www.kaggle.com/c/web-traffic-time-series-forecasting/discussion/36414
    '''
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    # Both terms use absolute values; with a signed y_true the denominator could
    # otherwise become zero or negative for non-zero inputs.
    denominator = (np.abs(y_true) + np.abs(y_pred)) / 200.0

    # Divide only where the denominator is non-zero; the remaining entries keep the 0.0
    # they were initialised with (no divide-by-zero warning, no NaN to patch afterwards).
    diff = np.zeros_like(denominator)
    np.divide(np.abs(y_true - y_pred), denominator, out=diff, where=denominator != 0)
    return np.mean(diff)

In [10]:
# https://www.kaggle.com/c/ventilator-pressure-prediction/discussion/282735
def better_than_median(inputs, axis, spread_lim=0.45):
    """Compute the mean of the predictions if there are no outliers,
    or the median if there are outliers.

    For each slice along `axis`, the spread (max - min) is compared with `spread_lim`:
    slices with spread below the limit are reduced with the mean, all others with the
    median. The inlier/outlier counts are printed.

    Parameters
    ----------
    inputs : array-like, e.g. ndarray of shape (n_samples, n_folds)
    axis : int
        Axis to reduce over (the fold axis).
    spread_lim : float
        Spread threshold separating "inliers" from "outliers". Defaults to 0.45, the
        value that was previously hard-coded.

    Returns
    -------
    ndarray with `axis` removed.
    """
    inputs = np.asarray(inputs)
    spread = inputs.max(axis=axis) - inputs.min(axis=axis)
    is_inlier = spread < spread_lim
    # Outliers are "everything np.where sends to the median", so the two counts always
    # add up to the total (which is the number of reduced slices, whatever `axis` is).
    print(f"Inliers:  {is_inlier.sum():7} -> compute mean")
    print(f"Outliers: {(~is_inlier).sum():7} -> compute median")
    print(f"Total:    {np.size(spread):7}")
    return np.where(is_inlier,
                    np.mean(inputs, axis=axis),
                    np.median(inputs, axis=axis))

In [11]:
# https://www.kaggle.com/teckmengwong/tps2201-hybrid-time-series
def plot_periodogram(ts, detrend='linear', ax=None):
    """
    Plot the periodogram of a daily series with the frequency axis in cycles per year.

    Parameters
    ----------
    ts : array-like
        The series to analyse; it is passed straight to `scipy.signal.periodogram`.
        The tick labels assume one observation per day.
    detrend : str
        Forwarded to `scipy.signal.periodogram` (default 'linear').
    ax : matplotlib Axes, optional
        Axes to draw on; a new figure/axes pair is created when omitted.

    Returns
    -------
    The axes that were drawn on.
    """
    from scipy.signal import periodogram

    # Samples per year for daily data. Written as a number because recent pandas releases
    # reject the ambiguous "Y" unit in pd.Timedelta("1Y"); 365.2425 is the mean Gregorian
    # year length in days (believed to be what the old "1Y" ratio evaluated to -- confirm
    # if exact equality with earlier plots matters).
    fs = 365.2425
    frequencies, spectrum = periodogram(
        ts,
        fs=fs,
        detrend=detrend,
        window="boxcar",
        scaling='spectrum',
    )
    if ax is None:
        _, ax = plt.subplots()
    ax.step(frequencies, spectrum, color="purple")
    ax.set_xscale("log")
    # Tick positions are cycles per year.
    ax.set_xticks([1, 2, 4, 6, 12, 26, 52, 104])
    ax.set_xticklabels(
        [
            "Annual (1)",
            "Semiannual (2)",
            "Quarterly (4)",
            "Bimonthly (6)",
            "Monthly (12)",
            "Biweekly (26)",
            "Weekly (52)",
            "Semiweekly (104)",
        ],
        rotation=30,
    )
    ax.ticklabel_format(axis="y", style="sci", scilimits=(0, 0))
    ax.set_ylabel("Variance")
    ax.set_title("Periodogram")
    return ax

In [12]:
# https://www.kaggle.com/teckmengwong/tps2201-hybrid-time-series
def fourier_features(index, freq, order):
    """
    Build sine/cosine seasonal features for harmonics 1..`order` of period `freq`.

    The time axis is simply 0, 1, ..., len(index) - 1 (the values in `index` are not
    inspected; it is only used to label the rows of the result).

    Returns a DataFrame indexed by `index` with columns
    `sin_{freq}_{i}` and `cos_{freq}_{i}` for each harmonic i, in that order.
    """
    steps = np.arange(len(index), dtype=np.float32)
    base_angle = 2 * np.pi * (1 / freq) * steps

    columns = {}
    for harmonic in range(1, order + 1):
        angle = harmonic * base_angle
        columns[f"sin_{freq}_{harmonic}"] = np.sin(angle)
        columns[f"cos_{freq}_{harmonic}"] = np.cos(angle)

    return pd.DataFrame(columns, index=index)

In [13]:
# dataset_params will initially include either trivial class instances or loaded, precomputed artifacts
dataset_params = {
    'train_source': str(datapath/'train.csv'),
    'target_source': str(datapath/'train.csv'),
    'test_source': str(datapath/'test.csv'),
    # 'scaler': str(RobustScaler()),
    # 'pca': str(load(datapath/'pca_mle-RobustScaled_orig_trainset.joblib')),
    # 'umap': str(load(datapath/'umap_reducer-20211107-n_comp10-n_neighbors15-rs42-pca_mle-RobustScaled_orig_trainset.joblib')),
}   

# referring back to the already-entered attributes, specify how the pipeline was sequenced
# dataset_params['preprocessing_pipeline'] = str([dataset_params['scaler'], dataset_params['pca'], dataset_params['umap']]) # ACTUALLY this is unwieldy
# dataset_params['preprocessing_pipeline'] = '[scaler, pca, umap]' # more fragile, but also more readable

# now, load the datasets and generate more metadata from them
train_df = pd.read_csv(datapath/'train.csv')
test_df = pd.read_csv(datapath/'test.csv')
orig_train_df = train_df.copy()
orig_test_df = test_df.copy()

In [14]:
# https://www.kaggle.com/ambrosm/tpsjan22-03-linear-model
for df in [train_df, test_df]:
    df['date'] = pd.to_datetime(df.date)

# for convenience later
countries = ['Sweden', 'Finland', 'Norway']
stores = ['KaggleMart', 'KaggleRama']
products = ['Kaggle Mug', 'Kaggle Hat', 'Kaggle Sticker']

In [15]:
# Stack the train and test rows so that feature engineering is applied to both in one pass.
# NOTE(review): `ignore_index` is not passed, so each frame keeps its own row labels and the
# combined index repeats labels (the displayed frame shows test rows labelled 6565-6569 while
# their row_id is 32863-32867). Later train/test splits therefore have to be positional.
all_df = pd.concat([train_df, test_df], axis=0)
# all_df.columns
# Sanity check: no rows were lost or duplicated by the concatenation (prints True/False).
print(len(all_df) == len(train_df) + len(test_df))
# Drop the per-split frames; untouched copies live on in orig_train_df / orig_test_df.
del train_df, test_df

In [16]:
def add_gdp_data(df):
    """
    Add a `gdp` column holding log1p of the GDP value for each row's country and year.

    The values are read from `GDP_data_2015_to_2019_Finland_Norway_Sweden.csv` under
    `datapath`; the file is indexed by its `year` column and looked up with the column
    name `'GDP_' + country`. `df` needs a datetime `date` column and a `country` column.

    The frame is modified in place and also returned.
    """
    gdp_by_year = pd.read_csv(
        datapath/'GDP_data_2015_to_2019_Finland_Norway_Sweden.csv'
    ).set_index('year')

    # Row-wise lookup: (year of the row's date, 'GDP_<country>') -> GDP value.
    raw_gdp = df.apply(
        lambda row: gdp_by_year.loc[row.date.year, 'GDP_' + row.country],
        axis=1,
    )
    df['gdp'] = np.log1p(raw_gdp)
    return df

In [17]:
gdp_exponent = 1.2121103201489674 # see https://www.kaggle.com/ambrosm/tpsjan22-03-linear-model for an explanation

In [18]:
all_df = add_gdp_data(all_df)

In [19]:
all_df

      row_id       date  country       store         product  num_sold  \
0          0 2015-01-01  Finland  KaggleMart      Kaggle Mug     329.0   
1          1 2015-01-01  Finland  KaggleMart      Kaggle Hat     520.0   
2          2 2015-01-01  Finland  KaggleMart  Kaggle Sticker     146.0   
3          3 2015-01-01  Finland  KaggleRama      Kaggle Mug     572.0   
4          4 2015-01-01  Finland  KaggleRama      Kaggle Hat     911.0   
...      ...        ...      ...         ...             ...       ...   
6565   32863 2019-12-31   Sweden  KaggleMart      Kaggle Hat       NaN   
6566   32864 2019-12-31   Sweden  KaggleMart  Kaggle Sticker       NaN   
6567   32865 2019-12-31   Sweden  KaggleRama      Kaggle Mug       NaN   
6568   32866 2019-12-31   Sweden  KaggleRama      Kaggle Hat       NaN   
6569   32867 2019-12-31   Sweden  KaggleRama  Kaggle Sticker       NaN   

           gdp  
0     5.461456  
1     5.461456  
2     5.461456  
3     5.461456  
4     5.461456  
...      

In [20]:
# Scratch cell for inspecting the `holidays` calendars: it builds the 2019 calendar for each
# of the three countries and iterates over its (date, name) items. Both print statements are
# commented out and the loop body is `pass`, so as written the cell produces no output and
# its only effect is to leave the loop variables `c` and `h` bound.
for c in [holidays.Finland, holidays.Sweden, holidays.Norway]:
#     print(c)
    for h in c(years = [2019], observed=True).items():
#         print(h)
        pass

In [21]:
def temporal_engineering(df):
    '''
    Function inspired by / borrowing from @teckmengwong and @ambrosm to create time features that will
    capture seasonality.

    Expects `df` to have a datetime `date` column plus `country`, `store` and `product`
    columns. Returns a NEW DataFrame containing the original columns followed by:

    * `month`, `season` (1=winter ... 4=fall), `wd4` (Friday), `wd56` (weekend)
    * Fourier terms `sin{k}` / `cos{k}` for k = 1, 5, ..., 29 and copies of them that are
      zeroed outside Finland, Norway, KaggleMart, Kaggle Mug and Kaggle Sticker rows
    * `season_sin` / `season_cos`
    * boolean day flags around year end, January, May, June, the Sweden Rock festival,
      the last Wednesday of June, the first Sunday of November, early December (Finland)
      and Easter

    The input frame is not modified.
    '''
    # Work on a copy. The original implementation wrote the first batch of columns straight
    # into the caller's frame and then returned a different object from pd.concat, leaving
    # the caller's frame half-updated.
    df = df.copy()

    # Frequently used pieces of the date and the categorical masks, computed once.
    dates = df['date']
    year = dates.dt.year
    month = dates.dt.month
    day = dates.dt.day
    weekday = dates.dt.weekday
    dayofyear = dates.dt.dayofyear

    in_finland = df['country'] == 'Finland'
    in_norway = df['country'] == 'Norway'
    in_sweden = df['country'] == 'Sweden'

    def anchor_dates(iso_by_year):
        """Map each row's year to that year's anchor date (NaT for years not listed)."""
        return year.map({yr: pd.Timestamp(iso) for yr, iso in iso_by_year.items()})

    def offset_flags(prefix, anchor, offsets, mask=None):
        """One boolean column `{prefix}{d}` per offset d: row date is d days after `anchor`."""
        flags = {}
        for d in offsets:
            hit = (dates - anchor) == np.timedelta64(d, "D")
            flags[f"{prefix}{d}"] = hit if mask is None else hit & mask
        return pd.DataFrame(flags)

    # --- calendar basics -------------------------------------------------------------
    df['month'] = month
    # Teck Meng Wong mapped the integers to first-letters in triplets
    # I'm leaving it as integers, where winter=1, spring=2, summer=3, fall=4
    df['season'] = ((month % 12 + 3) // 3)
    df['wd4'] = weekday == 4
    df['wd56'] = weekday >= 5

    # --- Fourier features --------------------------------------------------------------
    # The year is treated as 365 days long, so leap years are slightly off.
    gated_by = [
        ('finland', in_finland),
        ('norway', in_norway),
        ('store', df['store'] == 'KaggleMart'),
        ('mug', df['product'] == 'Kaggle Mug'),
        ('sticker', df['product'] == 'Kaggle Sticker'),
    ]
    for k in range(1, 32, 4):
        df[f'sin{k}'] = np.sin(dayofyear / 365 * 2 * math.pi * k)
        df[f'cos{k}'] = np.cos(dayofyear / 365 * 2 * math.pi * k)
        # Copies of the same terms that are zero outside one country / store / product.
        for prefix, mask in gated_by:
            df[f'{prefix}_sin{k}'] = np.where(mask, df[f'sin{k}'], 0)
            df[f'{prefix}_cos{k}'] = np.where(mask, df[f'cos{k}'], 0)

    df['season_sin'] = np.sin(dayofyear / 365 * 2 * math.pi * 91.5)
    df['season_cos'] = np.cos(dayofyear / 365 * 2 * math.pi * 91.5)

    # --- fixed-date flags ----------------------------------------------------------------
    in_dec, in_jan, in_may, in_june = month == 12, month == 1, month == 5, month == 6

    # End of year
    # Dec - teckmengwong
    for d in range(24, 32):
        df[f"dec{d}"] = in_dec & (day == d)
    # I'm unsure of the logic of only doing this for Norway
    for d in range(24, 32):
        df[f"n-dec{d}"] = in_dec & (day == d) & in_norway

    # not sure why he's using different date ranges for each country here
    # Jan - teckmengwong
    for d in range(1, 14):
        df[f"f-jan{d}"] = in_jan & (day == d) & in_finland
    for d in range(1, 10):
        df[f"n-jan{d}"] = in_jan & (day == d) & in_norway
    for d in range(1, 15):
        df[f"s-jan{d}"] = in_jan & (day == d) & in_sweden

    # May - teckmengwong
    for d in range(1, 10): # May Day and after, I guess
        df[f"may{d}"] = in_may & (day == d)
    for d in range(19, 26):
        df[f"may{d}"] = in_may & (day == d) & in_norway
    # June
    for d in range(8, 14):
        df[f"june{d}"] = in_june & (day == d) & in_sweden

    # --- flags relative to a per-year anchor date -------------------------------------------
    # Swedish Rock Concert - teckmengwong
    # Jun 3, 2015 – Jun 6, 2015
    # Jun 8, 2016 – Jun 11, 2016
    # Jun 7, 2017 – Jun 10, 2017
    # Jun 6, 2018 – Jun 10, 2018
    # Jun 5, 2019 – Jun 8, 2019
    # (anchored on the final day of each festival)
    swed_rock_fest = anchor_dates({2015: '2015-06-06',
                                   2016: '2016-06-11',
                                   2017: '2017-06-10',
                                   2018: '2018-06-10',
                                   2019: '2019-06-08'})
    df = pd.concat([df, offset_flags("swed_rock_fest", swed_rock_fest, range(-3, 3), in_sweden)], axis=1)

    # Last Wednesday of June - teckmengwong
    wed_june_date = anchor_dates({2015: '2015-06-24',
                                  2016: '2016-06-29',
                                  2017: '2017-06-28',
                                  2018: '2018-06-27',
                                  2019: '2019-06-26'})
    df = pd.concat([df, offset_flags("wed_june", wed_june_date, range(-4, 6), ~in_norway)], axis=1)

    # First Sunday of November - teckmengwong
    sun_nov_date = anchor_dates({2015: '2015-11-01',
                                 2016: '2016-11-06',
                                 2017: '2017-11-05',
                                 2018: '2018-11-04',
                                 2019: '2019-11-03'})
    df = pd.concat([df, offset_flags("sun_nov", sun_nov_date, range(0, 9), in_norway)], axis=1)

    # First half of December (Independence Day of Finland, 6th of December) -teckmengwong
    df = pd.concat([df, pd.DataFrame({f"dec{d}": in_dec & (day == d) & in_finland
                                      for d in range(6, 14)})], axis=1)

    # Easter -teckmengwong
    # Easter is computed once per distinct year and mapped back onto the rows, rather than
    # calling the (pure-Python) easter routine for every row.
    easter_by_year = {int(yr): pd.Timestamp(easter.easter(int(yr))) for yr in year.dropna().unique()}
    easter_date = pd.to_datetime(year.map(easter_by_year))
    easter_offsets = list(range(-2, 11)) + list(range(40, 48)) + list(range(50, 59))
    df = pd.concat([df, offset_flags("easter", easter_date, easter_offsets)], axis=1)

    return df

In [22]:
temporal_all_df = temporal_engineering(all_df)

In [23]:
temporal_all_df

      row_id       date  country       store         product  num_sold  \
0          0 2015-01-01  Finland  KaggleMart      Kaggle Mug     329.0   
1          1 2015-01-01  Finland  KaggleMart      Kaggle Hat     520.0   
2          2 2015-01-01  Finland  KaggleMart  Kaggle Sticker     146.0   
3          3 2015-01-01  Finland  KaggleRama      Kaggle Mug     572.0   
4          4 2015-01-01  Finland  KaggleRama      Kaggle Hat     911.0   
...      ...        ...      ...         ...             ...       ...   
6565   32863 2019-12-31   Sweden  KaggleMart      Kaggle Hat       NaN   
6566   32864 2019-12-31   Sweden  KaggleMart  Kaggle Sticker       NaN   
6567   32865 2019-12-31   Sweden  KaggleRama      Kaggle Mug       NaN   
6568   32866 2019-12-31   Sweden  KaggleRama      Kaggle Hat       NaN   
6569   32867 2019-12-31   Sweden  KaggleRama  Kaggle Sticker       NaN   

           gdp  month  season    wd4  ...  easter47  easter50  easter51  \
0     5.461456      1       1  False

In [24]:
# Build the regression target: log(num_sold / gdp ** gdp_exponent).
# Rows from the test set have NaN num_sold, so their target is NaN as well.
# NOTE(review): the `gdp` column written by add_gdp_data is already log1p of the GDP figure,
# so the value raised to gdp_exponent here is the log-scaled one, not the raw figure --
# confirm this is intended and that predictions are inverted with the same expression.
for df in [temporal_all_df]:
    df['target'] = np.log(df['num_sold'] / df['gdp']**gdp_exponent)

In [25]:
# encoded_all_df['target'] = np.log(encoded_all_df['num_sold'] / (encoded_all_df['gdp']**gdp_exponent))

In [26]:
temporal_all_df

      row_id       date  country       store         product  num_sold  \
0          0 2015-01-01  Finland  KaggleMart      Kaggle Mug     329.0   
1          1 2015-01-01  Finland  KaggleMart      Kaggle Hat     520.0   
2          2 2015-01-01  Finland  KaggleMart  Kaggle Sticker     146.0   
3          3 2015-01-01  Finland  KaggleRama      Kaggle Mug     572.0   
4          4 2015-01-01  Finland  KaggleRama      Kaggle Hat     911.0   
...      ...        ...      ...         ...             ...       ...   
6565   32863 2019-12-31   Sweden  KaggleMart      Kaggle Hat       NaN   
6566   32864 2019-12-31   Sweden  KaggleMart  Kaggle Sticker       NaN   
6567   32865 2019-12-31   Sweden  KaggleRama      Kaggle Mug       NaN   
6568   32866 2019-12-31   Sweden  KaggleRama      Kaggle Hat       NaN   
6569   32867 2019-12-31   Sweden  KaggleRama  Kaggle Sticker       NaN   

           gdp  month  season    wd4  ...  easter50  easter51  easter52  \
0     5.461456      1       1  False

In [27]:
def label_encoder(df):
    """
    Integer-encode the `country`, `product` and `store` columns of `df`.

    One LabelEncoder per column is fitted on the module-level `orig_train_df` (not on
    `df`), then used to transform the matching column of a copy of `df`.

    Returns
    -------
    (le_dict, enc_df)
        `le_dict` maps column name -> fitted LabelEncoder; `enc_df` is the encoded copy.
        `df` itself is left unchanged.
    """
    from sklearn.preprocessing import LabelEncoder

    enc_df = df.copy()
    le_dict = {}
    for feature in ('country', 'product', 'store'):
        encoder = LabelEncoder().fit(orig_train_df[feature])
        le_dict[feature] = encoder
        enc_df[feature] = encoder.transform(df[feature])
    return le_dict, enc_df

In [28]:
# for key in le_dict.keys():
#     print(f"Values for key {key} are {le_dict[key].inverse_transform(range(len(le_dict[key].values())))}")#"
# print(le_dict['country'].inverse_transform([0,1,2]))
# print(le_dict['product'].inverse_transform([0,1,2]))
# print(le_dict['store'].inverse_transform([0,1]))

In [29]:
# encoded_all_df = label_encoder(temporal_all_df)

In [30]:
# all_df = encoded_all_df.drop(columns=['num_sold', 'row_id'])
all_df = temporal_all_df.drop(columns=['row_id']) # writing over the previous version of `all_df`

In [31]:
tv_df = all_df[:len(orig_train_df)] # training and validation sets -- still not encoded
test_df = all_df[len(orig_train_df):] # still not encoded

In [32]:
# train_df = encoded_all_df.iloc[np.where(encoded_all_df['date'] < '2019-01-01'), :]
# test_df = encoded_all_df[[np.where(encoded_all_df['date'] > '2018-12-31')]]

# encoded_tv_df = encoded_all_df.drop(columns=['row_id'])[:len(orig_train_df)]
# encoded_test_df = encoded_all_df.drop(columns=['row_id'])[len(orig_train_df):]

# valid_df = tv_df[tv_df['date'] > '2017-12-31']
# train_df = tv_df[tv_df['date'] <= '2017-12-31']

# train_and_valid_residual_df = train_and_valid_df.drop(columns=['date'])
# test_residual_df = test_df.drop(columns=['date'])

# len(valid_df) + len(train_df) == len(tv_df)

# encoded_tv_df

In [33]:
from sklearn.linear_model import Ridge, HuberRegressor, LinearRegression, Lasso
from sklearn.neural_network import MLPRegressor
from prophet import Prophet
from neuralprophet import NeuralProphet
# earth? wouldn't install via pip on my machine at first

In [34]:
import torch
import torch.nn as nn
from torch.nn.utils import weight_norm
from skorch import NeuralNetRegressor
import torch.optim as optim

In [35]:
# prophet_trainset = load(predpath/'20220121_prophet_baseline_trainset.joblib')

# neural_trainset = load(predpath/'20220121_neuralprophet_baseline_trainset.joblib')
# neural_test_preds = load(predpath/'20220121_neuralprophet_baseline_testset.joblib')

# ridge_tv_preds = load(predpath/'20210121_ridge_baseline_trainset_preds.joblib')
# ridge_test_preds = load(predpath/'20220121_ridge_testset_preds.joblib')

In [36]:
# neural_tv_preds = neural_trainset['prophet_forecast']
# prophet_tv_preds = prophet_trainset['prophet_forecast']

# neural_train_preds = neural_tv_preds[:train_length]
# neural_valid_preds = neural_tv_preds[train_length:]

# prophet_train_preds = prophet_tv_preds[:train_length]
# prophet_valid_preds = prophet_tv_preds[train_length:]

# train_length = len(neural_trainset[neural_trainset['date'] <= '2017-12-31'])

# ridge_train_preds = ridge_tv_preds[:train_length]
# ridge_valid_preds = ridge_tv_preds[train_length:]

In [37]:
# train_linear_df = train_df.copy()
# valid_linear_df = valid_df.copy()
# test_linear_df = test_df.copy()
# tv_linear_df = tv_df.copy()

In [38]:
prophet_kwargs = {
    'growth':'linear',
#     'holidays':holidays_train, # will add this in-function
    'n_changepoints':10,
    'changepoint_range':0.4,
    'yearly_seasonality':True,
    'weekly_seasonality':True,
    'daily_seasonality':False,
    'seasonality_mode':'additive',
    'seasonality_prior_scale':25,
    'holidays_prior_scale':100,
    'changepoint_prior_scale':0.01,
    'interval_width':0.5,
    'uncertainty_samples':False
}

neuralprophet_kwargs = {
    'growth':'linear',
    'n_changepoints':10,
    'changepoints_range':0.4,
    'trend_reg':1,
    'trend_reg_threshold':False,
    'yearly_seasonality':True,
    'weekly_seasonality':True,
    'daily_seasonality':False,
    'seasonality_mode':'additive',
    'seasonality_reg':1,
    'n_forecasts':365,
    'normalize':'off'
}

# for pytorch / skorch
device = 'cuda' if torch.cuda.is_available() else 'cpu'

tcn_kwargs = {
#     'module': estimator, # will be handled at-call
    'criterion': nn.MSELoss, # consider enhancement here
    "lr": 0.01, # default is 0.01
    'optimizer':Adam,
    'max_epochs':10, # default is 10
    'device':device,
}

# model_params['hyperparams'] = str(neuralprophet_kwargs)
# model_params['holiday_source'] = 'Prophet builtin for each country'

In [39]:
xgboost_params = {
#     'tree_method': 'gpu_hist',
#     'predictor': 'gpu_predictor',
#     'eval_metric': ['mae', 'mape', 'rmse'],
    'learning_rate': .09,
    'max_depth': 0,
    'subsample': .15,
#     'sampling_method': 'gradient_based',
#     'seed': 42,
#     'grow_policy': 'lossguide',
    'max_leaves': 255,
    'lambda': 100,
#     'n_estimators': 3000,
#     'objective': 'reg:squarederror',
    'n_estimators': 50,
#     'verbose': True,
}


lightgbm_params = {
    'objective': 'mse',
    'random_state': 42,
    'device_type': 'cpu',
    'n_jobs': -1,
#                 eval_metric='auc',
#     'device_type': 'gpu',
#     'max_bin': 63, # 15 might be even better for GPU perf, but depends on dataset -- see https://lightgbm.readthedocs.io/en/latest/GPU-Performance.html
#     'gpu_use_dp': False,
    'max_depth': 0,
    'learning_rate': 0.1,
    'subsample': .15,
    'n_estimators': 1500,
}

catboost_params = {
    'task_type':'GPU',
    'silent':True,
    'random_state':42,
}
                

In [40]:
class TemporalBlock(nn.Module):
    """
    Residual block of a temporal convolutional network.

    Two weight-normalised convolutions, each preceded by left-only zero padding (so an
    output step never sees later input steps) and followed by ReLU and dropout, plus a
    skip connection from the block input.

    The convolutions are `nn.Conv2d` with a (1, kernel_size) kernel; `forward` inserts a
    dummy height dimension, so the block consumes and produces 3-D tensors
    (batch, channels, time).

    Parameters
    ----------
    n_inputs, n_outputs : int
        Input / output channel counts.
    kernel_size : int
        Kernel width along the time axis.
    stride, dilation : int
        Passed to both convolutions.
    padding : int
        Number of zeros added on the LEFT of the time axis before each convolution.
        For the residual sum to line up it should equal (kernel_size - 1) * dilation
        with stride 1.
    dropout : float
        Dropout probability after each ReLU.
    """
    def __init__(self, n_inputs, n_outputs, kernel_size, stride, dilation, padding, dropout=0.2):
        super(TemporalBlock, self).__init__()

        # The convolutions themselves never pad (padding=0 regardless of the `padding`
        # argument); padding is applied explicitly, on the left only, by `self.pad`.
        self.conv1 = weight_norm(nn.Conv2d(n_inputs, n_outputs, (1, kernel_size),
                                           stride=stride, padding=0, dilation=dilation))
        # ZeroPad2d takes (left, right, top, bottom): pad only the start of the time axis.
        self.pad = torch.nn.ZeroPad2d((padding, 0, 0, 0))
        # Stateless activation, shared by the sequential stack and the residual output.
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(dropout)

        self.conv2 = weight_norm(nn.Conv2d(n_outputs, n_outputs, (1, kernel_size),
                                           stride=stride, padding=0, dilation=dilation))

        # pad -> conv -> relu -> dropout, twice.
        self.net = nn.Sequential(self.pad, self.conv1, self.relu, self.dropout,
                                 self.pad, self.conv2, self.relu, self.dropout)

        # When the channel count changes, a 1x1 convolution projects the block input to
        # n_outputs channels so it can be added to the convolution output.
        self.downsample = nn.Conv1d(
            n_inputs, n_outputs, 1) if n_inputs != n_outputs else None

        self.init_weights()

    def init_weights(self):
        """Initialise all convolution weights from N(0, 0.01).

        With `weight_norm`, `conv.weight` is recomputed from `weight_g` (magnitude) and
        `weight_v` (direction) before every forward pass, so writing to `conv.weight`
        has no lasting effect. The direction tensor is therefore initialised instead and
        the magnitude is set to its per-output-channel norm, which makes the effective
        weight equal to the freshly drawn `weight_v`.
        """
        with torch.no_grad():
            for conv in (self.conv1, self.conv2):
                if hasattr(conv, 'weight_g') and hasattr(conv, 'weight_v'):
                    conv.weight_v.normal_(0, 0.01)
                    conv.weight_g.copy_(conv.weight_v.norm(p=2, dim=(1, 2, 3), keepdim=True))
                else:
                    # Fallback for a convolution without the g/v reparametrisation.
                    conv.weight.normal_(0, 0.01)
            if self.downsample is not None:
                self.downsample.weight.normal_(0, 0.01)

    def forward(self, x):
        # (batch, channels, time) -> (batch, channels, 1, time) for the Conv2d stack, and back.
        out = self.net(x.unsqueeze(2)).squeeze(2)
        # Skip connection: identity, or the 1x1 projection when channel counts differ.
        res = x if self.downsample is None else self.downsample(x)
        return self.relu(out + res)

In [41]:
class TemporalConvNet(nn.Module):
    """
    Stack of TemporalBlocks with exponentially growing dilation.

    Level i maps `num_channels[i-1]` channels (`num_inputs` for the first level) to
    `num_channels[i]` channels, using dilation 2**i and left padding
    (kernel_size - 1) * 2**i, with stride 1.
    """
    def __init__(self, num_inputs, num_channels, kernel_size=2, dropout=0.2):
        super().__init__()
        # Consecutive pairs of this list are the (in, out) channel counts of each level.
        channel_sizes = [num_inputs] + list(num_channels)

        blocks = []
        for level, (in_channels, out_channels) in enumerate(zip(channel_sizes[:-1], channel_sizes[1:])):
            dilation_size = 2 ** level
            blocks.append(
                TemporalBlock(in_channels, out_channels, kernel_size,
                              stride=1,
                              dilation=dilation_size,
                              padding=(kernel_size - 1) * dilation_size,
                              dropout=dropout)
            )

        self.network = nn.Sequential(*blocks)

    def forward(self, x):
        return self.network(x)

In [42]:
class TCNModel(nn.Module):
    """
    Temporal convolutional network followed by a linear layer giving one value per sample.

    Parameters
    ----------
    num_channels : sequence of int
        Output channel count of each TCN level; the last entry is the size of the
        feature vector fed to the linear decoder.
    kernel_size : int
        Kernel width used by every level.
    dropout : float
        Dropout probability, used inside the TCN and before the decoder.
    num_inputs : int
        Number of input channels expected by the first TCN level. Defaults to 128,
        the value that was previously hard-coded.
    """
    def __init__(self, num_channels, kernel_size=2, dropout=0.2, num_inputs=128):
        super(TCNModel, self).__init__()
        self.tcn = TemporalConvNet(
            num_inputs, num_channels, kernel_size=kernel_size, dropout=dropout)
        self.dropout = nn.Dropout(dropout)
        self.decoder = nn.Linear(num_channels[-1], 1)

    def forward(self, x):
        # Keep only the final position along the last axis of the TCN output, then decode it.
        return self.decoder(self.dropout(self.tcn(x)[:, :, -1]))

In [43]:
prophet_folds = [
    ('2015-01-01', '2018-01-01'),
    ('2018-01-01', '2019-01-01'),
]

In [44]:
# prophet_tv_df = tv_df_encoded.copy() # encoded_tv_df.copy()
# prophet_test_df = test_df_encoded.copy() # encoded_test_df.copy()

In [45]:
# for feature in ['country', 'product', 'store']:
#     prophet_tv_df[feature] = orig_train_df[feature]
#     prophet_test_df[feature] = orig_test_df[feature]

In [46]:
# prophet_tv_df.head()

In [47]:
# countries_enc = le_dict['country'].transform(countries)
# stores_enc = le_dict['store'].transform(stores)
# products_enc = le_dict['product'].transform(products)

# countries, countries_enc

In [48]:
def neuralprophet_trainer(model_kwargs=neuralprophet_kwargs, countries=countries, stores=stores, products=products, folds=prophet_folds,
                          tv_df=tv_df, test_df=test_df,
                          target='num_sold', wandb_tracked=False):
    """Fit one NeuralProphet model per (country, store, product) series and return its forecasts.

    For every combination, each window in ``folds`` except the last is used as the
    fitting range and the window after it as the validation range (both half-open,
    ``[start, end)``). The fitted model predicts the fitting rows, the validation
    rows and all test rows of that combination. With more than two windows, later
    fits overwrite the forecasts written by earlier ones for the rows they share.

    Default arguments are the notebook-level objects that exist when this cell runs.

    Args:
        model_kwargs: keyword arguments forwarded to ``NeuralProphet``.
        countries, stores, products: values iterated over to select one series at a time.
        folds: sliceable sequence of ``(start, end)`` bounds compared with the ``date`` column.
        tv_df: train/validation frame with ``date``, ``country``, ``store``, ``product``
            and ``target`` columns. It is copied, not modified.
        test_df: test frame with ``date``, ``country``, ``store`` and ``product``; also copied.
        target: name of the column to forecast.
        wandb_tracked: when True, open a W&B run, log per-series validation SMAPE and the
            overall averages, then finish the run.

    Returns:
        ``(train_forecast, test_forecast)``: the ``neuralprophet_forecast`` columns of the
        copies of ``tv_df`` and ``test_df``. Rows never selected by any window stay NaN.
    """
    train_smape = 0
    val_smape = 0
    n_scored = 0  # number of (series, fold) fits that contributed to the sums above

    # create local versions of the dataframes, to avoid mutation
    df_train = tv_df.copy()
    df_test = test_df.copy()

    if wandb_tracked:
        wandb.init(
            project="202201_Kaggle_tabular_playground",
            save_code=True,
            tags=wandb_config['tags'],
            name=wandb_config['name'],
            notes=wandb_config['notes'],
            config=exmodel_config
        )

    # no label encoding here -- but test it with too
    for country in countries:
        for store in stores:
            for product in products:
                # Row masks for this series; the date window is added per fold below.
                series_mask = (df_train['country'] == country) &\
                              (df_train['store'] == store) &\
                              (df_train['product'] == product)
                test_idx = (df_test['country'] == country) &\
                           (df_test['store'] == store) &\
                           (df_test['product'] == product)

                # Pair every window with its successor; the last window is validation-only.
                for fold, ((start, end), (val_start, val_end)) in enumerate(zip(folds, folds[1:])):
                    train_idx = series_mask & (df_train['date'] >= start) & (df_train['date'] < end)
                    val_idx = series_mask & (df_train['date'] >= val_start) & (df_train['date'] < val_end)

                    # rename the columns to the ds / y names the model is given here
                    train = df_train.loc[train_idx, ['date', target]].reset_index(drop=True)
                    train = train.rename(columns={'date': 'ds', target: 'y'})
                    val = df_train.loc[val_idx, ['date', target]].reset_index(drop=True)
                    val = val.rename(columns={'date': 'ds', target: 'y'})

                    model = NeuralProphet(**model_kwargs)
                    model = model.add_country_holidays(country_name=country)
                    model.fit(train, freq='D')

                    train_predictions = model.predict(train)['yhat1']
                    val_predictions = model.predict(val)['yhat1']
                    df_train.loc[train_idx, 'neuralprophet_forecast'] = train_predictions.values
                    df_train.loc[val_idx, 'neuralprophet_forecast'] = val_predictions.values

                    train_score = SMAPE(train['y'].values, train_predictions.values)
                    val_score = SMAPE(val['y'].values, val_predictions.values)

                    if wandb_tracked:
                        wandb.log({f"{(country,store,product)}_valid_smape": val_score})

                    train_smape += train_score
                    val_smape += val_score
                    n_scored += 1

                    print(f'\nTraining Range [{start}, {end}) - {country} - {store} - {product} - Train SMAPE: {train_score:4f}')
                    print(f'Validation Range [{val_start}, {val_end}) - {country} - {store} - {product} - Validation SMAPE: {val_score:4f}\n')

                    # The test frame is passed with an all-NaN y column, as the original code did.
                    test = df_test.loc[test_idx, ['date']].reset_index(drop=True)
                    test = test.rename(columns={'date': 'ds'})
                    test['y'] = np.nan
                    test_predictions = model.predict(test)['yhat1']

                    df_test.loc[test_idx, 'neuralprophet_forecast'] = test_predictions.values

    # Average over the fits that actually ran instead of a hard-coded 3*2*3.
    if n_scored:
        train_smape /= n_scored
        val_smape /= n_scored

    if wandb_tracked:
        wandb.log({'overall_train_smape': train_smape, 'overall_valid_smape': val_smape})
        wandb.finish()
    return df_train['neuralprophet_forecast'], df_test['neuralprophet_forecast']#, train_smape, val_smape

In [49]:
def prophet_trainer(prophet_kwargs=prophet_kwargs, countries=countries, stores=stores, products=products, folds=prophet_folds,
                    tv_df=tv_df, test_df=test_df,
                    target='num_sold', wandb_tracked=False):
    """Fit one Prophet model per (country, store, product) series and return its forecasts.

    For every combination, each window in ``folds`` except the last is used as the
    fitting range and the window after it as the validation range (both half-open,
    ``[start, end)``). The fitted model predicts the fitting rows, the validation
    rows and all test rows of that combination. With more than two windows, later
    fits overwrite the forecasts written by earlier ones for the rows they share.

    Default arguments are the notebook-level objects that exist when this cell runs.

    Args:
        prophet_kwargs: keyword arguments forwarded to ``Prophet``.
        countries, stores, products: values iterated over to select one series at a time.
        folds: sliceable sequence of ``(start, end)`` bounds compared with the ``date`` column.
        tv_df: train/validation frame with ``date``, ``country``, ``store``, ``product``
            and ``target`` columns. It is copied, not modified.
        test_df: test frame with ``date``, ``country``, ``store`` and ``product``; also copied.
        target: name of the column to forecast.
        wandb_tracked: when True, open a W&B run, log per-series validation SMAPE and the
            overall averages, then finish the run.

    Returns:
        ``(train_forecast, test_forecast)``: the ``prophet_forecast`` columns of the copies
        of ``tv_df`` and ``test_df``. Rows never selected by any window stay NaN.
    """
    train_smape = 0
    val_smape = 0
    n_scored = 0  # number of (series, fold) fits that contributed to the sums above

    # create local versions of the dataframes, to avoid mutation
    df_train = tv_df.copy()
    df_test = test_df.copy()

    if wandb_tracked:
        wandb.init(
            project="202201_Kaggle_tabular_playground",
            save_code=True,
            tags=wandb_config['tags'],
            name=wandb_config['name'],
            notes=wandb_config['notes'],
            config=exmodel_config
        )

    for country in countries:
        for store in stores:
            for product in products:
                # Row masks for this series; the date window is added per fold below.
                series_mask = (df_train['country'] == country) &\
                              (df_train['store'] == store) &\
                              (df_train['product'] == product)
                test_idx = (df_test['country'] == country) &\
                           (df_test['store'] == store) &\
                           (df_test['product'] == product)

                # Pair every window with its successor; the last window is validation-only.
                for fold, ((start, end), (val_start, val_end)) in enumerate(zip(folds, folds[1:])):
                    train_idx = series_mask & (df_train['date'] >= start) & (df_train['date'] < end)
                    val_idx = series_mask & (df_train['date'] >= val_start) & (df_train['date'] < val_end)

                    # rename the columns to the ds / y names the model is given here
                    train = df_train.loc[train_idx, ['date', target]].reset_index(drop=True)
                    train = train.rename(columns={'date': 'ds', target: 'y'})
                    val = df_train.loc[val_idx, ['date', target]].reset_index(drop=True)
                    val = val.rename(columns={'date': 'ds', target: 'y'})

                    model = Prophet(**prophet_kwargs)
                    model.add_country_holidays(country_name=country)
                    model.fit(train)

                    train_predictions = model.predict(train[['ds']])['yhat']
                    val_predictions = model.predict(val[['ds']])['yhat']
                    df_train.loc[train_idx, 'prophet_forecast'] = train_predictions.values
                    df_train.loc[val_idx, 'prophet_forecast'] = val_predictions.values

                    train_score = SMAPE(train['y'].values, train_predictions.values)
                    val_score = SMAPE(val['y'].values, val_predictions.values)

                    if wandb_tracked:
                        wandb.log({f"{(country,store,product)}_valid_smape": val_score})

                    train_smape += train_score
                    val_smape += val_score
                    n_scored += 1

                    print(f'\nTraining Range [{start}, {end}) - {country} - {store} - {product} - Train SMAPE: {train_score:4f}')
                    print(f'Validation Range [{val_start}, {val_end}) - {country} - {store} - {product} - Validation SMAPE: {val_score:4f}\n')

                    test = df_test.loc[test_idx, ['date']].reset_index(drop=True)
                    test = test.rename(columns={'date': 'ds'})
                    test_predictions = model.predict(test[['ds']])['yhat']

                    df_test.loc[test_idx, 'prophet_forecast'] = test_predictions.values

    # Average over the fits that actually ran instead of a hard-coded 3*2*3.
    if n_scored:
        train_smape /= n_scored
        val_smape /= n_scored

    if wandb_tracked:
        wandb.log({'overall_train_smape': train_smape, 'overall_valid_smape': val_smape})
        wandb.finish()
    return df_train['prophet_forecast'], df_test['prophet_forecast']#, train_smape, val_smape

In [50]:
def sklearn_trainer(estimator, model_kwargs={}, tv_df=tv_df, test_df=test_df,
                    folds=prophet_folds, countries=countries, stores=stores, products=products, target='target',
                    model_type=None, # None -> fully scikit-learn compatible; alternatives are 'skorch' or 'gbm'
                    wandb_tracked=False):
    """Fit one estimator per (country, store, product) series and return its forecasts.

    Both frames are copied and passed through ``label_encoder``. For every combination,
    each window in ``folds`` except the last is the fitting range and the window after
    it the validation range (half-open, ``[start, end)``). Dates are converted to
    ordinals before fitting. Predictions for the fitting, validation and test rows are
    written to a ``model_forecast`` column.

    Args:
        estimator: callable returning a model with ``fit``/``predict``. With
            ``model_type='skorch'`` it is instead called to build the module wrapped
            by ``NeuralNetRegressor``.
        model_kwargs: keyword arguments for the estimator (or for ``NeuralNetRegressor``
            in the skorch case). Never modified here.
        tv_df, test_df: frames with ``date``, ``country``, ``store``, ``product``, ``gdp``,
            ``num_sold`` and ``target`` columns; copied, not modified.
        folds: sliceable sequence of ``(start, end)`` bounds compared with ``date``.
        countries, stores, products: raw (un-encoded) labels; they are encoded here with
            the encoders fitted on the train/validation frame.
        target: dependent-variable column; the other of ``num_sold``/``target`` is dropped.
        model_type: ``'skorch'`` selects the tensor/``NeuralNetRegressor`` path; any other
            value calls ``estimator(**model_kwargs)`` directly.
        wandb_tracked: when True, a W&B run is opened before training.

    Returns:
        ``(train_forecast, test_forecast)``: the ``model_forecast`` columns. When
        ``target == 'target'`` they are mapped back with
        ``exp(forecast) * gdp ** gdp_exponent`` (``gdp_exponent`` is a notebook-level name).
    """
    # apply label encoding (which Scikit-Learn models require, but *Prophets don't);
    # encoding copies keeps the caller's frames untouched
    le_dict, tv_df = label_encoder(tv_df.copy())
    _, test_df = label_encoder(test_df.copy())

    # encode the lists of countries, stores, and products
    countries = le_dict['country'].transform(countries)
    stores = le_dict['store'].transform(stores)
    products = le_dict['product'].transform(products)

    if wandb_tracked:
        # NOTE(review): nothing is logged to this run and it is not finished in this
        # function -- confirm the caller is expected to log to and close it.
        wandb.init(
            project="202201_Kaggle_tabular_playground",
            save_code=True,
            tags=wandb_config['tags'],
            name=wandb_config['name'],
            notes=wandb_config['notes'],
            config=exmodel_config
        )

    # drop whichever version of the dependent variable is not being used
    unused_target = 'target' if target == 'num_sold' else 'num_sold'
    tv_df = tv_df.drop(columns=[unused_target])
    test_df = test_df.drop(columns=[unused_target])

    # handling each combination of country, store, and product separately
    for country in countries:
        for store in stores:
            for product in products:
                # Pair every window with its successor; the last window is validation-only.
                for fold, ((start, end), (val_start, val_end)) in enumerate(zip(folds, folds[1:])):
                    # rows in the training window with the correct country, store, and product
                    train_idx = (tv_df['date'] >= start) &\
                                (tv_df['date'] < end) &\
                                (tv_df['country'] == country) &\
                                (tv_df['store'] == store) &\
                                (tv_df['product'] == product)
                    train = tv_df.loc[train_idx, :].reset_index(drop=True)

                    val_idx = (tv_df['date'] >= val_start) &\
                              (tv_df['date'] < val_end) &\
                              (tv_df['country'] == country) &\
                              (tv_df['store'] == store) &\
                              (tv_df['product'] == product)
                    val = tv_df.loc[val_idx, :].reset_index(drop=True)

                    test_idx = (test_df['country'] == country) &\
                               (test_df['store'] == store) &\
                               (test_df['product'] == product)
                    test = test_df.loc[test_idx, :].reset_index(drop=True)

                    # with the subsets sorted out, make the dates integers for model fitting
                    for df in [train, val, test]:
                        df['date'] = df['date'].map(dt.datetime.toordinal)

                    # 'model_forecast' exists from the second series onward; never feed it back in
                    non_features = [target, 'model_forecast'] if 'model_forecast' in train.columns else [target]
                    X = train.drop(columns=non_features)
                    X_valid = val.drop(columns=non_features)
                    X_test = test.drop(columns=non_features)

                    y = train[target]
                    y_valid = val[target]

                    if model_type == 'skorch':
                        X = torch.tensor(X.to_numpy(dtype=np.float32))
                        X_valid = torch.tensor(X_valid.to_numpy(dtype=np.float32))
                        X_test = torch.tensor(X_test.to_numpy(dtype=np.float32))

                        y = torch.tensor(np.array(y)).reshape(-1,1)
                        # Built from the validation target (it used to be copied from the
                        # training ``y``).
                        y_valid = torch.tensor(np.array(y_valid)).reshape(-1,1)

                        # NOTE(review): this is a single int, whereas TemporalConvNet in this
                        # notebook calls len() on num_channels -- confirm the intended value.
                        tcn_kwargs = {
                            'num_channels': X_valid.shape[0] * X_valid.shape[1]
                        }

                        # instantiate the wrapper
                        model = NeuralNetRegressor(
                            module=estimator(**tcn_kwargs),
                            **model_kwargs
                        )
                    else:
                        model = estimator(**model_kwargs)

                    model.fit(X,y)

                    model_train_preds = model.predict(X)
                    model_valid_preds = model.predict(X_valid)
                    model_test_preds = model.predict(X_test)

                    tv_df.loc[train_idx, 'model_forecast'] = model_train_preds
                    tv_df.loc[val_idx, 'model_forecast'] = model_valid_preds
                    test_df.loc[test_idx, 'model_forecast'] = model_test_preds

    # reverse the dependent variable transform if appropriate
    if target == 'target':
        tv_df['model_forecast'] = np.exp(tv_df['model_forecast']) * tv_df['gdp']**gdp_exponent
        test_df['model_forecast'] = np.exp(test_df['model_forecast']) * test_df['gdp']**gdp_exponent

    return tv_df['model_forecast'], test_df['model_forecast']
    

In [51]:
from sklearn.model_selection import GroupKFold

In [52]:
def gbm_trainer(arch:str, model_kwargs={}, exmodel_config={}, tv_df=tv_df, test_df=test_df,
                countries=countries, stores=stores, products=products,
                target='target', wandb_tracked=True, random_state=42):
    """Cross-validate one gradient-boosting architecture and return its out-of-fold SMAPE.

    The train/validation frame is copied and label-encoded, the ``date`` column is used
    only to derive the year that groups rows for ``GroupKFold(n_splits=4)``, and a fresh
    model is fitted on each fold. Out-of-fold predictions are scored against
    ``tv_df['num_sold']``.

    Args:
        arch: ``'xgboost'``, ``'lightgbm'`` or ``'catboost'``.
        model_kwargs: extra keyword arguments for the model constructor. Never modified here.
        exmodel_config: passed to ``wandb.init(config=...)`` when tracking is on.
        tv_df: frame with ``date``, ``gdp``, ``num_sold`` and ``target`` columns; copied.
        test_df, countries, stores, products: accepted for signature compatibility; unused.
        target: column used as the dependent variable. When it is ``'target'`` the
            predictions are mapped back with ``exp(pred) * gdp ** gdp_exponent``
            (``gdp_exponent`` is a notebook-level name) before scoring.
        wandb_tracked: when True, open a W&B run, attach the W&B callback to
            xgboost/lightgbm fits, log the result and finish the run.
        random_state: seed given to the xgboost and catboost models and logged as
            ``model_seed``.

    Returns:
        SMAPE of the out-of-fold predictions.

    Raises:
        ValueError: if ``arch`` is not one of the supported names.
    """
    supported = ('xgboost', 'lightgbm', 'catboost')
    if arch not in supported:
        # Fail before any W&B run is opened; this used to surface as an unbound `model`.
        raise ValueError(f"Unsupported arch {arch!r}; expected one of {supported}")

    # label-encode a copy so the caller's tv_df is left alone
    _, X = label_encoder(tv_df.copy())

    if wandb_tracked:
        wandb.init(
            project="202201_Kaggle_tabular_playground",
            save_code=True,
            tags=wandb_config['tags'],
            name=wandb_config['name'],
            notes=wandb_config['notes'],
            config=exmodel_config
        )

    y = X[target]
    # Rows of the same calendar year always land in the same fold.
    groups = X.date.dt.year
    # Neither form of the dependent variable nor the raw date is a feature.
    X = X.drop(columns=['num_sold', 'target', 'date'])

    kfold = GroupKFold(n_splits=4)
    # Float-typed so fold predictions can be written without a dtype change.
    oof_preds = pd.Series(0.0, index=tv_df.index)

    for fold, (train_ids, valid_ids) in enumerate(kfold.split(X, groups=groups)):
        print(f"FOLD {fold}")
        print("------------------------------")

        # The splitter yields positions, so index positionally rather than by label.
        X_train, X_valid = X.iloc[train_ids, :], X.iloc[valid_ids, :]
        y_train = y.iloc[train_ids]

        fit_kwargs = {}
        if arch == 'xgboost':
            model = XGBRegressor(
                tree_method='gpu_hist',
                predictor='gpu_predictor',
                eval_metric=['mae', 'mape'],
                sampling_method='gradient_based',
                seed=random_state,
                grow_policy='lossguide',
                objective='reg:squarederror',
                **model_kwargs)
            if wandb_tracked:
                fit_kwargs['callbacks'] = [wandb.xgboost.wandb_callback()]
        elif arch == 'lightgbm':
            # LGBMRegressor is the name imported at the top of the notebook.
            model = LGBMRegressor(**model_kwargs)
            if wandb_tracked:
                fit_kwargs['callbacks'] = [wandb.lightgbm.wandb_callback()]
        else:  # 'catboost'
            model = CatBoostRegressor(
                task_type='GPU',
                verbose=False,
                random_state=random_state,
                **model_kwargs)

        # Fit exactly once per fold (xgboost/lightgbm used to be fitted twice).
        model.fit(X_train, y_train, **fit_kwargs)

        oof_preds.iloc[valid_ids] = model.predict(X_valid)

    # reverse the dependent variable transform if appropriate
    if target == 'target':
        oof_preds = np.exp(oof_preds) * tv_df['gdp']**gdp_exponent

    smape = SMAPE(y_pred=oof_preds, y_true=tv_df['num_sold'])

    if wandb_tracked:
        wandb.log({
            'arch': arch,
            'SMAPE': smape,
            'model_params': str(model_kwargs),
            'model_seed': random_state
        })
        wandb.finish()
    return smape

In [53]:
# %%time 
# smape = gbm_trainer(arch='xgboost', model_kwargs=xgboost_params, wandb_tracked=False)
# smape

In [54]:
# hyperparameter tuning
import optuna
from optuna.integration.wandb import WeightsAndBiasesCallback
from optuna.samplers import TPESampler

# tracking 
import wandb
from wandb.xgboost import wandb_callback
from wandb.lightgbm import wandb_callback
# Notebook filename, date-stamped when this cell runs, stored in the environment;
# the W&B run name configured later in the notebook is derived from it.
os.environ['WANDB_NOTEBOOK_NAME'] = f"optuna_forecasting_{datetime.now().strftime('%Y%m%d')}.ipynb"

In [55]:
# Run metadata passed as wandb.init(config=...) by the trainers that read this global.
# The value is a free-text label; the splitter built in gbm_trainer is GroupKFold(n_splits=4).
exmodel_config = {
    'cross-validation': 'GroupKFold(n_split=4)',
}

In [56]:
# wandb config:
# Settings shared by every wandb.init call in this notebook and by the Optuna W&B callback.
wandb_config = {
    # Run name: the notebook filename without its extension, plus the time this cell ran.
    # Path.stem drops the suffix whatever its length, instead of slicing off 6 characters.
    'name': f"{Path(os.environ['WANDB_NOTEBOOK_NAME']).stem}_{datetime.now().strftime('%H%M%S')}",
    'tags': ['study', 'gbms'],
    'notes': "Optuna study of forecasting methods (including the time-stripped GBMs)"
}

In [57]:
# Architecture to tune. objective() below captures this value as its default `arch`
# at the moment it is defined.
arch = 'xgboost'

In [58]:
# originally from https://www.kaggle.com/satorushibata/optimize-catboost-hyperparameter-with-optuna-gpu
def objective(trial, arch=arch):
    """Optuna objective: sample hyperparameters for ``arch`` and score them with ``gbm_trainer``.

    Args:
        trial: Optuna trial used to draw the hyperparameters.
        arch: ``'catboost'`` or ``'xgboost'``. The default is whatever the notebook-level
            ``arch`` was when this function was defined.

    Returns:
        The value returned by ``gbm_trainer`` for the sampled parameters (run with
        ``wandb_tracked=False``).

    Raises:
        NotImplementedError: for ``'lightgbm'``, which has no search space yet.
        ValueError: for any other unknown ``arch``.
    """
    # Device, seed and verbosity settings are fixed inside gbm_trainer, so only the
    # tunable parameters are sampled here. The order of suggest_* calls is unchanged.
    if arch == 'catboost':
        model_params = {
            'iterations' : trial.suggest_int('iterations', 2000, 30000),
            'depth' : trial.suggest_int('depth', 3, 10),
            'learning_rate' : trial.suggest_loguniform('learning_rate', 0.001, 0.5),
            'random_strength': trial.suggest_int('random_strength', 0, 100),
            'od_wait': trial.suggest_int('od_wait', 20, 2000),
            'reg_lambda': trial.suggest_uniform('reg_lambda', 2, 70), # aka l2_leaf_reg
            'border_count': trial.suggest_int('border_count', 50, 275),
            'min_child_samples': trial.suggest_int('min_child_samples', 1, 20), # aka min_data_in_leaf
            'leaf_estimation_iterations': trial.suggest_int('leaf_estimation_iterations', 1, 5),
        }

    elif arch == 'xgboost':
        model_params = {
            'n_estimators': trial.suggest_int('n_estimators', 500, 10000), # was 900-4500 for CPU
            # Registered with Optuna under the name 'depth', which is how it shows up
            # in study.best_trial.params.
            'max_depth' : trial.suggest_int('depth', 3, 10),
            'learning_rate' : trial.suggest_loguniform('learning_rate', 0.001, 0.3),
            'reg_alpha': trial.suggest_loguniform('reg_alpha', 0.001, 50),
            'reg_lambda': trial.suggest_loguniform('reg_lambda', 0.001, 30),
            'subsample': trial.suggest_uniform('subsample', 0.1, 1),
            'min_child_weight': trial.suggest_uniform('min_child_weight', 0.001, 12),
            'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.5, 1),
            'gamma': trial.suggest_uniform('gamma', 0.1, 10),
        }

    elif arch == 'lightgbm':
        # This branch used to fall through to the return with model_params unbound.
        raise NotImplementedError("No hyperparameter search space is defined for 'lightgbm' yet")

    else:
        raise ValueError(f"Unsupported arch {arch!r}; expected 'catboost', 'xgboost' or 'lightgbm'")

    return gbm_trainer(arch=arch, model_kwargs=model_params, wandb_tracked=False)

In [59]:
# Optuna callback configured from wandb_config; passed to study.optimize in the tuning loop.
wandbc = WeightsAndBiasesCallback(wandb_kwargs=wandb_config)

In [60]:
# Same value as assigned before objective() was defined; from here on it is used in the
# study name and in the dump filename.
arch = 'xgboost'

In [61]:
# Seed handed to the TPE sampler when the Optuna studies are created.
SEED = 42

In [62]:
# The timestamp identifies this study both in its name and in the dump filename used below.
start_time = datetime.now().strftime('%Y%m%d%H%M%S')
# The objective's value is minimized; the sampler is seeded with SEED.
study = optuna.create_study(direction = "minimize", 
                            sampler = TPESampler(seed=int(SEED)), 
                            study_name=f"{arch}_study-{start_time}")

In [63]:
%%time 
# One trial per optimize() call so the study is dumped to disk after every trial;
# range(1, 500) gives 499 iterations. `studypath` is defined outside this section.
for x in range(1, 500):
    study.optimize(objective, n_trials = 1, callbacks = [wandbc], show_progress_bar=False)#, catch=(xgboost.core.XGBoostError,)) 
    dump(study, filename=studypath/f"optuna_{arch}_study-{start_time}.joblib")

In [64]:
# Log the best trial's parameters (as a string) and the number of trials, then close the run.
# NOTE(review): presumably the active run is the one opened via the Optuna W&B callback -- confirm.
wandb.log({'best_params': str(study.best_trial.params),
#            'trials_in_run': len(study.trials),
           'trials_in_study': len(study.trials)
          })
wandb.finish()

In [65]:
# Display the parameter dict of the study's best trial.
study.best_trial.params

{'n_estimators': 4207,
 'depth': 5,
 'learning_rate': 0.05378597302351865,
 'reg_alpha': 0.0067949392113948815,
 'reg_lambda': 0.04865823628931899,
 'subsample': 0.212875760245356,
 'min_child_weight': 6.997692447967251,
 'colsample_bytree': 0.9824893256584818,
 'gamma': 0.10395228539921328}

In [66]:
# Parallel-coordinate view of the study, drawn with Optuna's built-in visualization.
optuna.visualization.plot_parallel_coordinate(study)

In [67]:
#### CatBoost Study

In [68]:
# Switch the tuning target to CatBoost. objective() is redefined further down, so its
# default `arch` picks up this value.
arch = 'catboost'

In [69]:
# Fresh Optuna W&B callback for the CatBoost study, built from the same wandb_config.
wandbc = WeightsAndBiasesCallback(wandb_kwargs=wandb_config)

In [70]:
# originally from https://www.kaggle.com/satorushibata/optimize-catboost-hyperparameter-with-optuna-gpu
def objective(trial, arch=arch):
    """Optuna objective: sample hyperparameters for ``arch`` and score them with ``gbm_trainer``.

    This is a redefinition: the default ``arch`` is re-bound to the notebook-level
    value at the time this cell runs (``'catboost'`` is assigned just above).

    Args:
        trial: Optuna trial used to draw the hyperparameters.
        arch: ``'catboost'`` or ``'xgboost'``.

    Returns:
        The value returned by ``gbm_trainer`` for the sampled parameters (run with
        ``wandb_tracked=False``).

    Raises:
        NotImplementedError: for ``'lightgbm'``, which has no search space yet.
        ValueError: for any other unknown ``arch``.
    """
    # Device, seed and verbosity settings are fixed inside gbm_trainer, so only the
    # tunable parameters are sampled here. The order of suggest_* calls is unchanged.
    if arch == 'catboost':
        model_params = {
            'iterations' : trial.suggest_int('iterations', 2000, 30000),
            'depth' : trial.suggest_int('depth', 3, 10),
            'learning_rate' : trial.suggest_loguniform('learning_rate', 0.001, 0.5),
            'random_strength': trial.suggest_int('random_strength', 0, 100),
            'od_wait': trial.suggest_int('od_wait', 20, 2000),
            'reg_lambda': trial.suggest_uniform('reg_lambda', 2, 70), # aka l2_leaf_reg
            'border_count': trial.suggest_int('border_count', 50, 275),
            'min_child_samples': trial.suggest_int('min_child_samples', 1, 20), # aka min_data_in_leaf
            'leaf_estimation_iterations': trial.suggest_int('leaf_estimation_iterations', 1, 5),
        }

    elif arch == 'xgboost':
        model_params = {
            'n_estimators': trial.suggest_int('n_estimators', 500, 10000), # was 900-4500 for CPU
            # Registered with Optuna under the name 'depth', which is how it shows up
            # in study.best_trial.params.
            'max_depth' : trial.suggest_int('depth', 3, 10),
            'learning_rate' : trial.suggest_loguniform('learning_rate', 0.001, 0.3),
            'reg_alpha': trial.suggest_loguniform('reg_alpha', 0.001, 50),
            'reg_lambda': trial.suggest_loguniform('reg_lambda', 0.001, 30),
            'subsample': trial.suggest_uniform('subsample', 0.1, 1),
            'min_child_weight': trial.suggest_uniform('min_child_weight', 0.001, 12),
            'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.5, 1),
            'gamma': trial.suggest_uniform('gamma', 0.1, 10),
        }

    elif arch == 'lightgbm':
        # This branch used to fall through to the return with model_params unbound.
        raise NotImplementedError("No hyperparameter search space is defined for 'lightgbm' yet")

    else:
        raise ValueError(f"Unsupported arch {arch!r}; expected 'catboost', 'xgboost' or 'lightgbm'")

    return gbm_trainer(arch=arch, model_kwargs=model_params, wandb_tracked=False)

In [71]:
# New timestamp and a new study object for the current `arch`; this replaces the
# previous `study` variable.
start_time = datetime.now().strftime('%Y%m%d%H%M%S')
# The objective's value is minimized; the sampler is seeded with SEED.
study = optuna.create_study(direction = "minimize", 
                            sampler = TPESampler(seed=int(SEED)), 
                            study_name=f"{arch}_study-{start_time}")

In [72]:
%%time
# One trial per optimize() call so the study is dumped to disk after every trial;
# range(1, 500) gives 499 iterations. `studypath` is defined outside this section.
for x in range(1, 500):
    study.optimize(objective, n_trials = 1, callbacks = [wandbc], show_progress_bar=False)#, catch=(xgboost.core.XGBoostError,)) 
    dump(study, filename=studypath/f"optuna_{arch}_study-{start_time}.joblib")

In [73]:
# Log the best trial's parameters (as a string) and the number of trials, then close the run.
# NOTE(review): presumably the active run is the one opened via the Optuna W&B callback -- confirm.
wandb.log({'best_params': str(study.best_trial.params),
#            'trials_in_run': len(study.trials),
           'trials_in_study': len(study.trials)
          })
wandb.finish()

In [74]:
# Display the parameter dict of the study's best trial.
study.best_trial.params

{'iterations': 10529,
 'depth': 3,
 'learning_rate': 0.07026263205443048,
 'random_strength': 44,
 'od_wait': 261,
 'reg_lambda': 35.672029887566374,
 'border_count': 57,
 'min_child_samples': 19,
 'leaf_estimation_iterations': 2}

In [75]:
%%time
# Same loop again on the same `study` and `start_time`: another 499 single-trial calls
# are appended to the study and the same dump file is rewritten after each one.
for x in range(1, 500):
    study.optimize(objective, n_trials = 1, callbacks = [wandbc], show_progress_bar=False)#, catch=(xgboost.core.XGBoostError,)) 
    dump(study, filename=studypath/f"optuna_{arch}_study-{start_time}.joblib")

In [76]:
# Log the best trial's parameters (as a string) and the number of trials, then close the run.
# NOTE(review): presumably a W&B run is still active at this point -- confirm.
wandb.log({'best_params': str(study.best_trial.params),
#            'trials_in_run': len(study.trials),
           'trials_in_study': len(study.trials)
          })
wandb.finish()

In [77]:
# NOTE(review): same statements as the preceding cell, which already ends with
# wandb.finish() -- verify that a run is active here and that this cell is still needed.
wandb.log({'best_params': str(study.best_trial.params),
#            'trials_in_run': len(study.trials),
           'trials_in_study': len(study.trials)
          })
wandb.finish()

In [78]:
# Reload a study dumped earlier with joblib, identified by its timestamped filename.
# This replaces the in-memory `study` variable.
study = load(studypath/'optuna_catboost_study-20220127082356.joblib')

In [79]:
# Best parameters of the reloaded study. Optuna's Study exposes `best_params`
# (and `best_trial.params`); it has no `best_trial_params` attribute.
study.best_params

In [80]:
# Display the parameter dict of the reloaded study's best trial.
study.best_trial.params

{'iterations': 10529,
 'depth': 3,
 'learning_rate': 0.07026263205443048,
 'random_strength': 44,
 'od_wait': 261,
 'reg_lambda': 35.672029887566374,
 'border_count': 57,
 'min_child_samples': 19,
 'leaf_estimation_iterations': 2}

In [81]:
def gbm_trainer(arch:str, model_kwargs={}, exmodel_config={}, tv_df=tv_df, test_df=test_df,
                countries=countries, stores=stores, products=products, 
                target='target', wandb_tracked=True, random_state=42):
    """Cross-validate one gradient-boosting regressor and return its out-of-fold SMAPE.

    A copy of `tv_df` is label-encoded with `label_encoder`, split into four
    folds with `GroupKFold` grouped by the year of the `date` column, and a
    fresh model is fitted once per fold. Validation-fold predictions are
    collected into one out-of-fold series and scored against
    `tv_df['num_sold']` with `SMAPE`.

    Parameters
    ----------
    arch : str
        One of 'xgboost', 'lightgbm' or 'catboost'.
    model_kwargs : dict
        Extra keyword arguments forwarded to the model constructor.
    exmodel_config : dict
        Passed as `config` to `wandb.init` when `wandb_tracked` is true.
    tv_df : DataFrame
        Training/validation data. Must contain 'date' (datetime-like),
        'num_sold', 'target' and the `target` column; 'gdp' is also needed
        when `target == 'target'`. The caller's frame is not modified here
        (a copy is encoded).
    test_df : DataFrame
        Currently unused; kept for interface compatibility.
    countries, stores, products : sequences
        Transformed with the fitted encoders from `label_encoder`; the
        encoded values are not used further in this function.
    target : str
        Name of the column used as the regression target.
    wandb_tracked : bool
        When true, a W&B run is opened, the result is logged, and the run
        is finished.
    random_state : int
        Seed given to the XGBoost and CatBoost models and logged as
        'model_seed'.

    Returns
    -------
    The value of `SMAPE(y_pred=oof_preds, y_true=tv_df['num_sold'])`. When
    `target == 'target'` the predictions are first mapped back with
    `np.exp(pred) * tv_df['gdp'] ** gdp_exponent`; otherwise they are scored
    as-is.

    Raises
    ------
    ValueError
        If `arch` is not one of the supported architectures.
    """
    supported_archs = ('xgboost', 'lightgbm', 'catboost')
    if arch not in supported_archs:
        # Fail before opening a W&B run or doing any encoding work.
        raise ValueError(f"Unsupported arch {arch!r}; expected one of {supported_archs}")

    # Work on a copy so the caller's dataframe is left untouched.
    X = tv_df.copy()

    # Label-encode the categorical columns (needed by the scikit-learn-style models).
    le_dict, X = label_encoder(X)

    # Encode the lists of countries, stores, and products.
    # NOTE(review): these encoded lists are not used below — confirm whether they can be dropped.
    countries = le_dict['country'].transform(countries)
    stores = le_dict['store'].transform(stores)
    products = le_dict['product'].transform(products)

    if wandb_tracked:
        wandb.init(
            project="202201_Kaggle_tabular_playground",
            save_code=True,
            tags=wandb_config['tags'],
            name=wandb_config['name'],
            notes=wandb_config['notes'],
            config=exmodel_config
        )

    # Separate the target, remember the fold groups, then drop every column
    # that must not be used as a feature (both target variants and the date).
    y = X[target]
    fold_groups = X['date'].dt.year
    X = X.drop(columns=['num_sold', 'target', 'date'])

    kfold = GroupKFold(n_splits=4)
    # Float dtype up front: the series is filled with float predictions.
    oof_preds = pd.Series(0.0, index=tv_df.index)

    for fold, (train_ids, valid_ids) in enumerate(kfold.split(X, groups=fold_groups)):
        print(f"FOLD {fold}")
        print("------------------------------")

        # The splitter yields positional indices, so index by position.
        X_train, X_valid = X.iloc[train_ids, :], X.iloc[valid_ids, :]
        y_train = y.iloc[train_ids]

        fit_kwargs = {}
        if arch == 'xgboost':
            model = XGBRegressor(
                tree_method='gpu_hist',
                predictor='gpu_predictor',
                eval_metric=['mae', 'mape'],
                sampling_method='gradient_based',
                seed=random_state,
                grow_policy='lossguide',
                objective='reg:squarederror',
                **model_kwargs)
            if wandb_tracked:
                fit_kwargs['callbacks'] = [wandb.xgboost.wandb_callback()]
        elif arch == 'lightgbm':
            model = LGBMRegressor(**model_kwargs)
            if wandb_tracked:
                fit_kwargs['callbacks'] = [wandb.lightgbm.wandb_callback()]
        else:  # 'catboost' (arch was validated above)
            model = CatBoostRegressor(
                task_type='GPU',
                verbose=False,
                random_state=random_state,
                **model_kwargs)

        # Fit exactly once per fold, with the W&B callback when one applies.
        model.fit(X_train, y_train, **fit_kwargs)

        oof_preds.iloc[valid_ids] = model.predict(X_valid)

    if target == 'target':
        # Map predictions of the transformed target back to the num_sold scale.
        oof_preds = np.exp(oof_preds) * tv_df['gdp']**gdp_exponent

    smape = SMAPE(y_pred=oof_preds, y_true=tv_df['num_sold'])

    if wandb_tracked:
        wandb.log({
            'arch': arch,
            'SMAPE': smape,
            'model_params': str(model_kwargs),
            'model_seed': random_state
        })
        wandb.finish()
    return smape

In [82]:
# Pull the best trial's parameters out of the saved XGBoost study, then move
# the value stored under 'depth' to the key 'max_depth'.
best_xgboost_params = load(
    filename=studypath / 'optuna_xgboost_study-20220126213551.joblib'
).best_trial.params
best_xgboost_params['max_depth'] = best_xgboost_params.pop('depth')
best_xgboost_params

{'n_estimators': 4207,
 'learning_rate': 0.05378597302351865,
 'reg_alpha': 0.0067949392113948815,
 'reg_lambda': 0.04865823628931899,
 'subsample': 0.212875760245356,
 'min_child_weight': 6.997692447967251,
 'colsample_bytree': 0.9824893256584818,
 'gamma': 0.10395228539921328,
 'max_depth': 5}

In [83]:
# Cross-validate XGBoost with the tuned parameters; the returned SMAPE is the cell output.
gbm_trainer(
    model_kwargs=best_xgboost_params,
    arch='xgboost',
)