This solution is built on the following notebooks. 

References:
1. https://www.kaggle.com/code/mayer79/m5-forecast-poisson-loss-top-10
2. https://www.kaggle.com/code/anshuls235/time-series-forecasting-eda-fe-modelling

# Import packages

In [37]:
import pandas as pd
import numpy as np
import os

import models
import utils

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Notebook settings

In [38]:
# Debug switch: when True, the training and validation splits are truncated to their
# leading rows before any model is fitted, trading accuracy for a fast end-to-end run.
debug = False

# Load data

In [39]:
# Raw competition CSVs are read from `data_dir`; generated submission files are written
# to `submission_dir`, which is created up front if it does not exist yet.
data_dir = "data"
submission_dir = "submissions"
os.makedirs(submission_dir, exist_ok=True)

# Read the five input tables in one pass; the target names on the left line up
# one-to-one, in order, with the file names listed on the right.
(
    calendar,
    selling_prices,
    sample_submission_accuracy,
    sample_submission_uncertainty,
    sales,
) = (
    pd.read_csv(os.path.join(data_dir, csv_name))
    for csv_name in (
        "calendar.csv",
        "sell_prices.csv",
        "sample_submission_accuracy.csv",
        "sample_submission_uncertainty.csv",
        "sales_train_evaluation.csv",
    )
)

In [40]:
# Peek at the row ids of the accuracy template: each id is a series name followed by a
# `_validation` or `_evaluation` suffix.
sample_submission_accuracy['id']

0        HOBBIES_1_001_CA_1_validation
1        HOBBIES_1_002_CA_1_validation
2        HOBBIES_1_003_CA_1_validation
3        HOBBIES_1_004_CA_1_validation
4        HOBBIES_1_005_CA_1_validation
                     ...              
60975      FOODS_3_823_WI_3_evaluation
60976      FOODS_3_824_WI_3_evaluation
60977      FOODS_3_825_WI_3_evaluation
60978      FOODS_3_826_WI_3_evaluation
60979      FOODS_3_827_WI_3_evaluation
Name: id, Length: 60980, dtype: object

In [41]:
# List every row id the uncertainty template holds for one example series
# (ids starting with HOBBIES_1_001_CA_1): one id per quantile level and per phase.
sample_submission_uncertainty.loc[
    sample_submission_uncertainty['id'].str.startswith('HOBBIES_1_001_CA_1'),
    'id',
]

12350     HOBBIES_1_001_CA_1_0.005_validation
55190     HOBBIES_1_001_CA_1_0.025_validation
98030     HOBBIES_1_001_CA_1_0.165_validation
140870    HOBBIES_1_001_CA_1_0.250_validation
183710    HOBBIES_1_001_CA_1_0.500_validation
226550    HOBBIES_1_001_CA_1_0.750_validation
269390    HOBBIES_1_001_CA_1_0.835_validation
312230    HOBBIES_1_001_CA_1_0.975_validation
355070    HOBBIES_1_001_CA_1_0.995_validation
397910    HOBBIES_1_001_CA_1_0.005_evaluation
440750    HOBBIES_1_001_CA_1_0.025_evaluation
483590    HOBBIES_1_001_CA_1_0.165_evaluation
526430    HOBBIES_1_001_CA_1_0.250_evaluation
569270    HOBBIES_1_001_CA_1_0.500_evaluation
612110    HOBBIES_1_001_CA_1_0.750_evaluation
654950    HOBBIES_1_001_CA_1_0.835_evaluation
697790    HOBBIES_1_001_CA_1_0.975_evaluation
740630    HOBBIES_1_001_CA_1_0.995_evaluation
Name: id, dtype: object

In [42]:
# Quantile levels required by the M5 uncertainty submission: the median plus the lower and
# upper endpoints of the central 50%, 67%, 95% and 99% intervals.
q_levels = [
    0.005, 0.025, 0.165, 0.25,   # lower endpoints
    0.5,                         # median
    0.75, 0.835, 0.975, 0.995,   # upper endpoints
]

## Prepare data

In [43]:
# Replace the raw calendar table with the prepared version returned by utils.prep_calendar.
# NOTE(review): this rebinds `calendar`, so re-running the cell without re-running the load
# cell feeds already-prepared data back into prep_calendar — confirm that is safe.
calendar = utils.prep_calendar(calendar)
# Preview the first rows of the prepared calendar.
calendar.head()

Unnamed: 0,wm_yr_wk,wday,month,year,d,event_name_1,event_name_2,snap_CA,snap_TX,snap_WI
0,11101,1,1,2011,1,1,1,0,0,0
1,11101,2,1,2011,2,1,1,0,0,0
2,11101,3,1,2011,3,1,1,0,0,0
3,11101,4,2,2011,4,1,1,1,1,0
4,11101,5,2,2011,5,1,1,1,0,1


In [44]:
# Build the train/validation splits, the frame used for forecasting, and the feature list.
# NOTE(review): `drop_d` presumably discards the earliest days of sales history, and the
# `- 28` looks like headroom for the 28-day lag features — confirm in
# utils.prepare_training_data.
(
    x_train, x_valid,
    y_train, y_valid,
    test, features,
) = utils.prepare_training_data(
    sales,
    calendar,
    selling_prices,
    drop_d=1000 - 28,
)

In [45]:
# Sanity check: shapes of the four train/validation pieces followed by the forecasting frame.
tuple(split.shape for split in (x_train, x_valid, y_train, y_valid, test))

((25053633, 20), (2783737, 20), (25053633,), (2783737,), (3414880, 23))

In [46]:
# Inspect the validation features; for a frame this large pandas renders only the first
# and last rows.
x_valid

Unnamed: 0,dept_id,lag_t7,rolling_mean_lag28_w28,rolling_mean_lag7_w7,event_name_1,state_id,store_id,month,wday,cat_id,rolling_mean_lag7_w28,year,lag_t28,event_name_2,snap_TX,snap_WI,snap_CA,item_id,sell_price,rolling_mean_lag28_w7
11504520,6,0.0,0.142857,0.000000,1,1,4,12,6,3,0.142857,2014,0.0,1,0,0,1,2081,7.98,0.142857
24160035,3,0.0,0.642857,0.285714,1,1,4,1,1,1,0.428571,2016,0.0,1,0,0,0,1197,1.00,0.142857
25700385,4,0.0,0.214286,0.142857,1,3,10,3,2,2,0.214286,2016,0.0,1,1,0,0,1802,14.72,0.285714
7027236,3,0.0,0.000000,0.000000,1,2,5,7,6,1,0.000000,2014,0.0,1,0,0,1,729,1.18,0.000000
6939249,3,2.0,0.428571,0.714286,1,2,6,7,3,1,0.607143,2014,0.0,1,1,0,1,1163,2.98,0.142857
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20598905,3,0.0,0.071429,0.285714,1,2,6,9,3,1,0.357143,2015,0.0,1,0,0,0,1299,3.57,0.285714
27382803,3,0.0,0.928571,0.857143,17,1,1,5,2,1,0.857143,2016,1.0,1,0,1,1,1172,3.00,0.857143
8391450,6,0.0,3.821429,2.857143,1,1,3,8,2,3,3.535714,2014,9.0,1,0,0,0,2040,2.97,3.285714
21405768,1,1.0,0.821429,0.428571,1,1,1,10,2,1,0.750000,2015,0.0,1,0,0,0,177,9.97,1.285714


In [47]:
# Inspect the validation target (the `demand` series), which shares its index with x_valid.
y_valid

11504520    1.0
24160035    1.0
25700385    0.0
7027236     0.0
6939249     1.0
           ... 
20598905    0.0
27382803    1.0
8391450     1.0
21405768    0.0
19874504    0.0
Name: demand, Length: 2783737, dtype: float32

In [48]:
# Inspect the frame handed to the forecasting helpers. In the saved output the latest days
# have NaN demand and short-lag features — presumably these are filled in step by step
# during recursive forecasting; confirm in utils.
test

Unnamed: 0,id,item_id,store_id,state_id,dept_id,cat_id,d,demand,lag_t7,rolling_mean_lag7_w7,...,rolling_mean_lag28_w28,wday,month,year,event_name_1,event_name_2,snap_CA,snap_TX,snap_WI,sell_price
25276210,HOBBIES_1_001_CA_1,1438,1,1,4,2,1858,0.0,4.0,1.142857,...,0.785714,3,2,2016,1,1,0,0,0,8.26
25276211,HOBBIES_1_002_CA_1,1439,1,1,4,2,1858,0.0,0.0,0.285714,...,0.178571,3,2,2016,1,1,0,0,0,3.97
25276212,HOBBIES_1_003_CA_1,1440,1,1,4,2,1858,0.0,0.0,0.428571,...,0.107143,3,2,2016,1,1,0,0,0,2.97
25276213,HOBBIES_1_004_CA_1,1441,1,1,4,2,1858,0.0,2.0,1.857143,...,2.071429,3,2,2016,1,1,0,0,0,4.64
25276214,HOBBIES_1_005_CA_1,1442,1,1,4,2,1858,1.0,0.0,0.857143,...,0.750000,3,2,2016,1,1,0,0,0,2.88
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28691085,FOODS_3_823_WI_3,1433,10,3,3,1,1969,,,,...,0.642857,2,6,2016,18,4,0,0,0,2.98
28691086,FOODS_3_824_WI_3,1434,10,3,3,1,1969,,,,...,0.285714,2,6,2016,18,4,0,0,0,2.48
28691087,FOODS_3_825_WI_3,1435,10,3,3,1,1969,,,,...,0.785714,2,6,2016,18,4,0,0,0,3.98
28691088,FOODS_3_826_WI_3,1436,10,3,3,1,1969,,,,...,1.321429,2,6,2016,18,4,0,0,0,1.28


In [49]:
# Debug mode: keep only the leading rows of every split so model fitting finishes quickly.
# Features and targets are cut to the same length so they stay aligned.
if debug:
    x_train = x_train[:100_000]
    y_train = y_train[:100_000]
    x_valid = x_valid[:10_000]
    y_valid = y_valid[:10_000]

# Training and Evaluation

## Mean prediction

In [50]:
# Fit the point-forecast LightGBM model on the training split, passing the validation split
# for monitoring. Per the training log it tracks validation RMSE and stops early after
# 200 rounds without improvement. Long-running on the full data; set debug=True for a quick pass.
lgbm = models.LGBM().fit(x_train, x_valid, y_train, y_valid)

[LightGBM] [Info] Total Bins 1782
[LightGBM] [Info] Number of data points in the train set: 25053633, number of used features: 20
[LightGBM] [Info] Start training from score 0.206062
[LightGBM] [Info] Start training from score 0.206062
Training until validation scores don't improve for 200 rounds
Training until validation scores don't improve for 200 rounds
[100]	valid_0's rmse: 2.2626
[100]	valid_0's rmse: 2.2626
[200]	valid_0's rmse: 2.22463
[200]	valid_0's rmse: 2.22463
[300]	valid_0's rmse: 2.19782
[300]	valid_0's rmse: 2.19782
[400]	valid_0's rmse: 2.18319
[400]	valid_0's rmse: 2.18319
[500]	valid_0's rmse: 2.17208
[500]	valid_0's rmse: 2.17208


KeyboardInterrupt: 

In [None]:
# Produce point forecasts over the horizon with the fitted LightGBM model.
# NOTE(review): presumably recursive (earlier predictions feed later lag features), like the
# quantile variant used further down — confirm in utils.forecast_point_horizon.
pred = utils.forecast_point_horizon(lgbm, test, features)

In [None]:
# Side-by-side view of the `demand` column carried by `test` and the model's forecast, for
# days from utils.FIRST onwards (presumably the first forecast day — confirm in utils).
# Both inputs are named "demand", so `keys` gives the two output columns distinct labels;
# `.loc[mask, col]` replaces the chained `frame[mask][col]` lookup.
pd.concat(
    [
        test.loc[test['d'] >= utils.FIRST, 'demand'],
        pred.loc[pred['d'] >= utils.FIRST, 'demand'],
    ],
    axis=1,
    keys=['test_demand', 'pred_demand'],
)

In [None]:
# Write the point forecasts to submissions/submission_accuracy.csv, passing the sample
# accuracy submission as the template (`cols_template`) for the output layout.
utils.save_accuracy_submission(
    pred, 
    cols_template=sample_submission_accuracy, 
    filepath=os.path.join(submission_dir, "submission_accuracy.csv")
)

In [None]:
# Read the accuracy submission back from disk and display it as a check on what was written.
submission_accuracy = pd.read_csv(os.path.join(submission_dir, "submission_accuracy.csv"))

submission_accuracy

## Uncertainty (quantile) prediction

In [None]:
# Train the distributional LightGBM model (Negative Binomial) on the same splits as the
# point model. This cell only fits; quantiles are produced by the forecasting cell that follows.
dist_lgbm = models.DistributionalLGBM().fit(x_train, x_valid, y_train, y_valid)

In [None]:
# Recursive quantile forecasts from the distributional LightGBM model, one set of
# predictions per level in q_levels.
pred_per_q_dist_lgbm = utils.forecast_quantile_horizon(dist_lgbm, test, features, q_levels)

In [None]:
# Convert the per-quantile forecasts to the submission file format and write them to
# submissions/submission_uncertainty_dist_lgbm.csv, using the sample uncertainty submission
# as the template (`cols_template`).
utils.save_uncertainty_submission(
    pred_per_q_dist_lgbm,
    cols_template=sample_submission_uncertainty,
    filepath=os.path.join(submission_dir, "submission_uncertainty_dist_lgbm.csv")
)

In [None]:
# Read the distributional-LightGBM uncertainty submission back from disk and display it as a
# check on what was written.
submission_uncertainty_dist_lgbm = pd.read_csv(os.path.join(submission_dir, "submission_uncertainty_dist_lgbm.csv"))

submission_uncertainty_dist_lgbm

Unnamed: 0,id,F1,F2,F3,F4,F5,F6,F7,F8,F9,...,F19,F20,F21,F22,F23,F24,F25,F26,F27,F28
0,FOODS_1_001_CA_1_0.005_evaluation,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,FOODS_1_001_CA_1_0.005_validation,2.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,6.0,...,2.0,3.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,FOODS_1_001_CA_1_0.025_evaluation,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,FOODS_1_001_CA_1_0.025_validation,2.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,6.0,...,2.0,3.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,FOODS_1_001_CA_1_0.165_evaluation,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
548815,HOUSEHOLD_2_516_WI_3_0.835_validation,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
548816,HOUSEHOLD_2_516_WI_3_0.975_evaluation,22.0,22.0,22.0,22.0,22.0,22.0,23.0,2.0,2.0,...,1.0,1.0,1.0,2.0,2.0,2.0,2.0,2.0,1.0,2.0
548817,HOUSEHOLD_2_516_WI_3_0.975_validation,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
548818,HOUSEHOLD_2_516_WI_3_0.995_evaluation,33.0,33.0,33.0,33.0,34.0,33.0,34.0,3.0,2.0,...,2.0,2.0,2.0,3.0,3.0,2.0,3.0,2.0,2.0,2.0


In [None]:
# Train the NGBoost Poisson model on the same splits as the other models.
# This cell only fits; quantiles are produced by the forecasting cell that follows.
ngb = models.NGBoostPoisson().fit(x_train, x_valid, y_train, y_valid)

[iter 0] loss=2.4121 val_loss=2.3352 scale=0.1250 norm=0.1640


In [None]:
# Recursive quantile forecasts from the NGBoost Poisson model, one set of predictions per
# level in q_levels.
pred_per_q_ngb = utils.forecast_quantile_horizon(ngb, test, features, q_levels)

In [None]:
# Convert the per-quantile forecasts to the submission file format and write them to
# submissions/submission_uncertainty_ngb.csv, using the sample uncertainty submission as the
# template (`cols_template`).
utils.save_uncertainty_submission(
    pred_per_q_ngb,
    cols_template=sample_submission_uncertainty,
    filepath=os.path.join(submission_dir, "submission_uncertainty_ngb.csv")
)

True

In [None]:
# Read the NGBoost uncertainty submission back from disk and display it as a check on what
# was written.
submission_uncertainty_ngb = pd.read_csv(os.path.join(submission_dir, "submission_uncertainty_ngb.csv"))

submission_uncertainty_ngb

Unnamed: 0,id,F1,F2,F3,F4,F5,F6,F7,F8,F9,...,F19,F20,F21,F22,F23,F24,F25,F26,F27,F28
0,FOODS_1_001_CA_1_0.005_evaluation,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,FOODS_1_001_CA_1_0.005_validation,2.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,6.0,...,2.0,3.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,FOODS_1_001_CA_1_0.025_evaluation,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,FOODS_1_001_CA_1_0.025_validation,2.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,6.0,...,2.0,3.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,FOODS_1_001_CA_1_0.165_evaluation,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
548815,HOUSEHOLD_2_516_WI_3_0.835_validation,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
548816,HOUSEHOLD_2_516_WI_3_0.975_evaluation,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,4.0,4.0
548817,HOUSEHOLD_2_516_WI_3_0.975_validation,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
548818,HOUSEHOLD_2_516_WI_3_0.995_evaluation,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,...,5.0,5.0,5.0,5.0,5.0,5.0,6.0,6.0,7.0,7.0
