In [1]:
# notebook configuration
# if '/sf/' in pwd:
#     COLAB, SAGE = False, False
# elif 'google.colab' in str(get_ipython()):
#     COLAB, SAGE = True, False # do colab-specific installs later
# else:
#     COLAB, SAGE = False, True
    
CONTEXT = 'local' # or 'colab', 'sage', 'kaggle'
USE_GPU = True 
%config Completer.use_jedi = False

## Imports

In [2]:
# basic imports
from pathlib import Path
import os
import math
from datetime import datetime
import random

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import warnings
warnings.filterwarnings("ignore")

import requests # for telegram notifications
from tqdm.notebook import tqdm

from joblib import dump, load

Now, non-stdlib imports

In [3]:
# model selection
from sklearn.model_selection import train_test_split, StratifiedKFold, KFold

# metrics
# from sklearn.metrics import accuracy_score#, log_loss, roc_auc_score

# eda
import missingno
# import doubtlab 

# data cleaning
# from sklearn.impute import SimpleImputer #, KNNImputer
# import cleanlab

# normalization
# from sklearn.preprocessing import RobustScaler, StandardScaler, MinMaxScaler, MaxAbsScaler, RobustScaler, QuantileTransformer
# from gauss_rank_scaler import GaussRankScaler

# feature generation
# from sklearn.preprocessing import PolynomialFeatures
# import category_encoders as ce

# models
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
# from sklearn.linear_model import LogisticRegression
# from sklearn.ensemble import StackingClassifier, RandomForestClassifier

# feature reduction
# from sklearn.decomposition import PCA
# from umap import UMAP

# clustering
# from sklearn.cluster import DBSCAN, KMeans
# import hdbscan

# feature selection
# from sklearn.feature_selection import SelectKBest, f_regression, mutual_info_regression
# import featuretools as ft
# from BorutaShap import BorutaShap
# from boruta import BorutaPy

# tracking 
import wandb
from wandb.xgboost import wandb_callback
from wandb.lightgbm import wandb_callback
os.environ['WANDB_NOTEBOOK_NAME'] = f"nb_{datetime.now().strftime('%Y%m%d')}.ipynb"

In [4]:
# deep learning
import torch
from torch.optim import Adam, AdamW, Adagrad, SGD, RMSprop, LBFGS
from torch.optim.lr_scheduler import ReduceLROnPlateau, CosineAnnealingWarmRestarts, CyclicLR, OneCycleLR, StepLR, CosineAnnealingLR

# widedeep
# from pytorch_widedeep import Trainer
# from pytorch_widedeep.preprocessing import WidePreprocessor, TabPreprocessor
# from pytorch_widedeep.models import Wide, TabMlp, WideDeep, SAINT#, TabTransformer, TabNet, TabFastFormer, TabResnet
# from pytorch_widedeep.metrics import Accuracy
# from pytorch_widedeep.callbacks import EarlyStopping, LRHistory, ModelCheckpoint

In [5]:
# time series
import tsfresh

import darts
from darts import TimeSeries

In [6]:
from darts.models import ExponentialSmoothing, AutoARIMA, ARIMA, Prophet, RandomForest, RegressionEnsembleModel, RegressionModel, TFTModel, TCNModel, TransformerModel, NBEATSModel

## Routing

Now, datapath setup

In [7]:
if CONTEXT == 'colab':
    # mount Google Drive
    from google.colab import drive
    drive.mount('/content/drive')
    
    # handling datapath
    # datapath = Path('/content/drive/MyDrive/kaggle/tabular_playgrounds/dec2021/')
    root = Path('') # TODO

elif CONTEXT == 'sage':
    root = Path('') # TODO
    
elif CONTEXT == 'kaggle':
    root = Path('') # TODO
    
else: # if on local machine
    root = Path('/media/sf/easystore/kaggle_data/tabular_playgrounds/jan2022/')
    datapath = root/'datasets'
    # edapath = root/'EDA'
    # modelpath = Path('/media/sf/easystore/kaggle_data/tabular_playgrounds/oct2021/models/')
    predpath = root/'preds'
    subpath = root/'submissions'
    studypath = root/'studies'
    
    for pth in [datapath, predpath, subpath, studypath]:
        pth.mkdir(exist_ok=True)

## Helpers

In [8]:
SEED = 42

# Function to seed everything but the models
def seed_everything(seed, pytorch=True, reproducible=True):
    random.seed(seed)
    np.random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    if pytorch:
        torch.manual_seed(seed) # set torch CPU seed
        if torch.cuda.is_available():
            torch.cuda.manual_seed_all(seed) # set torch GPU(s) seed(s)
        if reproducible and torch.backends.cudnn.is_available():
            torch.backends.cudnn.deterministic = True
            torch.backends.cudnn.benchmark = False

seed_everything(seed=SEED)

In [9]:
def reduce_memory_usage(df, verbose=True):
    """
    Function to reduce memory usage by downcasting datatypes in a Pandas DataFrame when possible.
    
    h/t to Bryan Arnold (https://www.kaggle.com/puremath86/label-correction-experiments-tps-nov-21)
    """
    
    numerics = ["int8", "int16", "int32", "int64", "float16", "float32", "float64"]
    start_mem = df.memory_usage().sum() / 1024 ** 2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == "int":
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if (
                    c_min > np.finfo(np.float16).min
                    and c_max < np.finfo(np.float16).max
                ):
                    df[col] = df[col].astype(np.float16)
                elif (
                    c_min > np.finfo(np.float32).min
                    and c_max < np.finfo(np.float32).max
                ):
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024 ** 2
    if verbose:
        print(
            "Mem. usage decreased to {:.2f} Mb ({:.1f}% reduction)".format(
                end_mem, 100 * (start_mem - end_mem) / start_mem
            )
        )
    return df

In [10]:
tg_api_token = 'your_api_token' # for Galileo (jupyter_watcher_bot) on Telegram
tg_chat_id = 'your_chat_id'

import requests

def send_tg_message(text='Cell execution completed.'):  
    """
    h/t Ivan Dembicki Jr. for the base version 
    (https://medium.com/@ivan.dembicki.jr/notifications-in-jupyter-notebook-with-telegram-f2e892c55173)
    """
    requests.post('https://api.telegram.org/' +  'bot{}/sendMessage'.format(tg_api_token),
                  params=dict(chat_id=tg_chat_id, text=text))

In [11]:
def SMAPE(y_true, y_pred):
    '''
    h/t Jean-François Puget (@CPMP) -- see https://www.kaggle.com/c/web-traffic-time-series-forecasting/discussion/36414
    '''
    denominator = (y_true + np.abs(y_pred)) / 200.0
    diff = np.abs(y_true - y_pred) / denominator
    diff[denominator == 0] = 0.0
    return np.mean(diff)

# EDA (Comp)

In [12]:
prophet_preds = pd.read_csv(subpath/'LB12.96585__20220121_prophet_baseline_preds.csv')['num_sold']
neural_preds = pd.read_csv(subpath/'LB9.97748__20220121_neuralprophet_baseline_preds.csv')['num_sold']

In [13]:
df = pd.DataFrame({
    'prophet': prophet_preds,
    'neural_preds': neural_preds
})

In [14]:
df.corr()

Unnamed: 0,prophet,neural_preds
prophet,1.0,0.991902
neural_preds,0.991902,1.0


So these predictions are pretty highly correlated. How do they compare with the ridge regression preds?

In [15]:
ridge_preds = pd.read_csv(subpath/'LB19.29136__20220120_ridge_baseline_rounded_preds.csv')['num_sold']

In [16]:
df['ridge'] = ridge_preds

In [17]:
df

Unnamed: 0,prophet,neural_preds,ridge
0,412.359844,399.074371,331
1,647.046273,580.306213,530
2,174.315373,175.331146,154
3,715.824645,680.846680,580
4,1112.556479,1002.552185,929
...,...,...,...
6565,887.296786,698.816772,616
6566,257.187475,239.840546,179
6567,1041.756321,751.112549,676
6568,1541.823842,1036.387329,1079


In [18]:
df.corr()

Unnamed: 0,prophet,neural_preds,ridge
prophet,1.0,0.991902,0.954771
neural_preds,0.991902,1.0,0.945984
ridge,0.954771,0.945984,1.0


So the ridge regression is closer to prophet than neural prophet, but not particularly close to either -- and the prophets are quite close to one another.

In [19]:
neural_trainset = load(predpath/'20220121_neuralprophet_baseline_trainset.joblib')

In [20]:
prophet_trainset = load(predpath/'20220121_prophet_baseline_trainset.joblib')

In [21]:
neural_trainset

Unnamed: 0,row_id,date,country,store,product,num_sold,prophet_forecast
0,0,2015-01-01,Finland,KaggleMart,Kaggle Mug,329,337.860291
1,1,2015-01-01,Finland,KaggleMart,Kaggle Hat,520,471.527588
2,2,2015-01-01,Finland,KaggleMart,Kaggle Sticker,146,146.239853
3,3,2015-01-01,Finland,KaggleRama,Kaggle Mug,572,571.667114
4,4,2015-01-01,Finland,KaggleRama,Kaggle Hat,911,800.308228
...,...,...,...,...,...,...,...
26293,26293,2018-12-31,Sweden,KaggleMart,Kaggle Hat,823,688.424316
26294,26294,2018-12-31,Sweden,KaggleMart,Kaggle Sticker,250,234.144318
26295,26295,2018-12-31,Sweden,KaggleRama,Kaggle Mug,1004,734.039551
26296,26296,2018-12-31,Sweden,KaggleRama,Kaggle Hat,1441,1014.238037


In [28]:
neural_tv_preds = neural_trainset['prophet_forecast']

In [27]:
prophet_tv_preds = prophet_trainset['prophet_forecast']

In [29]:
train_length = len(neural_trainset[neural_trainset['date'] > '2017-12-31'])

In [30]:
neural_train_preds = neural_tv_preds[:train_length]
neural_valid_preds = neural_tv_preds[train_length:]

prophet_train_preds = prophet_tv_preds[:train_length]
prophet_valid_preds = prophet_tv_preds[train_length:]


Let's just compare the 2017 preds

In [31]:
valid_preds = pd.DataFrame({
    'neural_prophet': neural_valid_preds,
    'prophet': prophet_valid_preds
})

In [32]:
valid_preds.corr()

Unnamed: 0,neural_prophet,prophet
neural_prophet,1.0,0.989519
prophet,0.989519,1.0
