## Libraries

In [1]:
%pip install -qqq mlforecast

Note: you may need to restart the kernel to use updated packages.


In [2]:
from pathlib import Path

import lightgbm as lgb
import mlforecast
import numpy as np
import polars as pl
from mlforecast import MLForecast
from mlforecast.lag_transforms import ExpandingMean, RollingMean, SeasonalRollingMean

In [3]:
mlforecast.__version__

'1.0.1'

In [4]:
pl.enable_string_cache()

## Data setup

In [5]:
input_path = Path('m5-forecasting-accuracy/')

### Calendar

In [6]:
cal_dtypes = {
    'date': pl.Datetime,
    'd': pl.Categorical,
    'wm_yr_wk': pl.Int32,
    'event_name_1': pl.Categorical,
    'event_type_1': pl.Categorical,
    'event_name_2': pl.Categorical,
    'event_type_2': pl.Categorical,
    'snap_CA': pl.Int32,
    'snap_TX': pl.Int32,
    'snap_WI': pl.Int32,
}
cal = pl.read_csv(input_path / 'calendar.csv', schema_overrides=cal_dtypes, columns=list(cal_dtypes.keys()))
event_cols = [k for k in cal_dtypes if k.startswith('event')]
cal = cal.with_columns(pl.col(event_cols).fill_null('nan'))

### Prices

In [7]:
prices_dtypes = {
    'store_id': pl.Categorical,
    'item_id': pl.Categorical,
    'wm_yr_wk': pl.Int32,
    'sell_price': pl.Float32,
}
prices = pl.read_csv(input_path / 'sell_prices.csv', schema_overrides=prices_dtypes)

### Sales

In [8]:
sales_dtypes = {
    'id': pl.Categorical,
    'item_id': pl.Categorical,
    'dept_id': pl.Categorical,
    'cat_id': pl.Categorical,
    'store_id': pl.Categorical,
    'state_id': pl.Categorical,
    **{f'd_{i}': pl.Float32 for i in range(1942)}
}
sales = pl.read_csv(input_path / 'sales_train_evaluation.csv', schema_overrides=sales_dtypes)

In [9]:
import polars as pl

id_vars = ['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id']
value_vars = [col for col in sales.columns if col.startswith("d_")]

long = (
    sales.lazy()
    .melt(id_vars=id_vars, value_vars=value_vars, variable_name="d", value_name="y")
    .collect()
)


  .melt(id_vars=id_vars, value_vars=value_vars, variable_name="d", value_name="y")


In [10]:
%%time
print(long.shape[0])
long = long.with_columns(pl.col('d').cast(pl.Categorical))
long = long.join(cal, on=['d'])
dates = sorted(long['date'].unique())
long = long.sort(['id', 'date'])
without_leading_zeros = pl.col('y').gt(0).cast(pl.Int64).cum_max().over('id').cast(pl.Boolean)
above_min_date = pl.col('date') >= dates[-400]
keep_mask = without_leading_zeros & above_min_date
long = long.filter(keep_mask)
print(long.shape[0])

59181090
12159132
CPU times: total: 22.8 s
Wall time: 2.76 s


In [11]:
long = long.join(prices, on=['store_id', 'item_id', 'wm_yr_wk'])


In [12]:
# Get max date from long (original train + prices merged)
last_date_all = long['date'].max()
valid_horizon = 28
valid_start_date = last_date_all - pl.duration(days=valid_horizon)
train_end = valid_start_date - pl.duration(days=valid_horizon)

# Split long into train and validation (just like time-based CV)
train_long = long.filter(pl.col("date") < valid_start_date)
valid_long = long.filter(pl.col("date") >= valid_start_date)

# Build calendar + prices (X_df) for the validation period only
future_cal = cal.filter((pl.col('date') >= valid_start_date) & (pl.col('date') <= last_date_all))
future_prices = prices.filter(pl.col('wm_yr_wk').is_in(future_cal['wm_yr_wk'].unique()))

# Create IDs to match training IDs
future_prices = future_prices.with_columns(
    id = (pl.col('item_id') + '_' + pl.col('store_id') + '_evaluation')
)

X_df = future_prices.join(future_cal, on='wm_yr_wk')
X_df = X_df.drop(['store_id', 'item_id', 'wm_yr_wk', 'd'])
X_df = X_df.with_columns(pl.col('id').cast(pl.Categorical))


Please use `implode` to return to previous behavior.

See https://github.com/pola-rs/polars/issues/22149 for more information.
  future_prices = prices.filter(pl.col('wm_yr_wk').is_in(future_cal['wm_yr_wk'].unique()))


In [13]:
for h in range(0, 28 + 1):
    train_long = train_long.with_columns(
        pl.col('y').shift(-h).over('id').alias(f'y_t+{h}')
    )

In [14]:
df_clean = train_long.drop_nulls()

In [15]:
df_clean = df_clean.with_columns(
    pl.arange(0, df_clean.height).alias("row_number")
)

In [16]:
y_cols = [f'y_t+{h}' for h in range(0, 28 + 1)]
X_train = df_clean.drop(y_cols + ['date', 'id'])
y_train = df_clean

## Training

Since at the time of making this LightGBM can't handle polars dataframes with categorical features we'll build the features as numpy arrays as described [here](https://nixtla.github.io/mlforecast/docs/how-to-guides/training_with_numpy.html#preprocess-method).

In [17]:
fcst = MLForecast(
    models=[],
    freq='1d',
    lags=[7 * (i+1) for i in range(8)],
    lag_transforms = {
        1 :  [ExpandingMean()],
        7 :  [RollingMean(7), RollingMean(14), RollingMean(28), SeasonalRollingMean(7, 4)],
        14:  [RollingMean(7), RollingMean(14), RollingMean(28), SeasonalRollingMean(7, 4)],
        28:  [RollingMean(7), RollingMean(14), RollingMean(28), SeasonalRollingMean(7, 4)],
    },
    date_features=['year', 'month', 'day', 'weekday', 'quarter', 'week'],    
    num_threads=4,
)

In [18]:
long["event_name_1"].value_counts()

event_name_1,count
cat,u32
"""EidAlAdha""",30404
"""Easter""",30490
"""MartinLutherKingDay""",30477
"""OrthodoxChristmas""",30474
"""nan""",11186101
…,…
"""Thanksgiving""",30462
"""LentWeek2""",30490
"""NBAFinalsEnd""",30325
"""ColumbusDay""",30429


In [50]:
def preprocess_multi_output(
    fcst,
    df,
    target_col='y',
    categoricals=None,
    horizon=28
):
    """
    Extends mlforecast.preprocess to support multi-output (multi-horizon) targets.
    """
    import polars as pl
    import numpy as np

    # Step 1: Run standard fcst.preprocess to get feature DataFrame
    X_df = fcst.preprocess(
        df,
        id_col='id',
        time_col='date',
        target_col=target_col,
        static_features=categoricals or [],
        return_X_y=False,
        as_numpy=False
    )

    # Step 2: Add shifted target columns for each horizon step
    for h in range(1, horizon + 1):
        X_df = X_df.with_columns(
            pl.col(target_col).shift(-h).over('id').alias(f'y_t+{h}')
        )

    # Step 3: Drop rows with any missing targets
    y_cols = [f'y_t+{i}' for i in range(1, horizon + 1)]
    X_df = X_df.drop_nulls(subset=y_cols)

    # Step 4: Prepare X and y
    X = X_df.drop(['id', 'date', target_col] + y_cols).to_numpy()
    y = X_df.select(y_cols).to_numpy()

    return X, y


In [51]:
X_train, y_train = preprocess_multi_output(
    fcst=fcst,
    df=train_long.drop(["d", "wm_yr_wk"]),
    target_col='y',
    categoricals=categoricals,
    horizon=28
)

In [53]:
%%time
categoricals = ['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id']
X_train, y = fcst.preprocess(
    train_long.drop(["d","wm_yr_wk"]),
    id_col='id',
    time_col='date',
    target_col='y',
    static_features=categoricals,
    return_X_y=True,    
    as_numpy=True,
)

CPU times: total: 31.2 s
Wall time: 2.79 s


In [40]:
%%time
categoricals = ['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id']
X_train, y = fcst.preprocess(
    train_long.drop(["d","wm_yr_wk"]),
    id_col='id',
    time_col='date',
    target_col='y',
    static_features=categoricals,
    return_X_y=True,    
    as_numpy=True,
)

CPU times: total: 33.1 s
Wall time: 2.69 s


In [41]:
X_train

array([[5.0630e+03, 2.1000e+03, 5.1580e+03, ..., 7.0000e+00, 2.0000e+00,
        2.4000e+01],
       [5.0630e+03, 2.1000e+03, 5.1580e+03, ..., 1.0000e+00, 2.0000e+00,
        2.5000e+01],
       [5.0630e+03, 2.1000e+03, 5.1580e+03, ..., 2.0000e+00, 2.0000e+00,
        2.5000e+01],
       ...,
       [3.5565e+04, 5.0000e+03, 7.2620e+03, ..., 4.0000e+00, 2.0000e+00,
        1.6000e+01],
       [3.5565e+04, 5.0000e+03, 7.2620e+03, ..., 5.0000e+00, 2.0000e+00,
        1.6000e+01],
       [3.5565e+04, 5.0000e+03, 7.2620e+03, ..., 6.0000e+00, 2.0000e+00,
        1.6000e+01]])

In [31]:
X_train = X_train.drop_nulls()

In [32]:
y_train = X_train.select(y_cols)
X_train = X_train.drop(y_cols)


In [34]:
from sklearn.preprocessing import LabelEncoder
import polars as pl

categoricals = ['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id']
df_encoded = X_train.clone()

for col in categoricals:
    le = LabelEncoder()
    df_encoded = df_encoded.with_columns(
        pl.Series(col, le.fit_transform(df_encoded[col].to_list()))
    )


In [35]:
from sklearn.multioutput import MultiOutputRegressor
from sklearn.base import clone
from joblib import Parallel, delayed

class MultiOutputLGBMRegressor(MultiOutputRegressor):
    def __init__(self, estimator, n_jobs=None, feature_name=None, categorical_feature=None):
        super().__init__(estimator, n_jobs=n_jobs)
        self.feature_name = feature_name
        self.categorical_feature = categorical_feature

    def fit(self, X, Y, **fit_params):
        if len(Y.shape) != 2:
            raise ValueError("Y must be 2D for multi-output regression.")
        
        self.estimators_ = Parallel(n_jobs=self.n_jobs)(
            delayed(self._fit_single)(
                i, X, Y[:, i]
            ) for i in range(Y.shape[1])
        )
        return self

    def _fit_single(self, i, X, y):
        estimator = clone(self.estimator)
        estimator.fit(
            X, y,
            feature_name=self.feature_name,
            categorical_feature=self.categorical_feature
        )
        return estimator

    def predict(self, X):
        return super().predict(X)


In [39]:
import lightgbm as lgb

model_params = {
    'verbose': -1,
    'force_col_wise': True,
    'num_leaves': 256,
    'n_estimators': 50,
}

base_model = lgb.LGBMRegressor(**model_params)

multi_model = MultiOutputLGBMRegressor(
    estimator=base_model,
    feature_name=fcst.ts.features_order_,
    categorical_feature=categoricals,
    n_jobs=-1
)

%time multi_model.fit(df_encoded, y_train)


ValueError: could not convert string to float: 'NBAFinalsEnd'

In [105]:
from sklearn.multioutput import MultiOutputRegressor
import lightgbm as lgb

model_params = {
    'verbose': -1,
    'force_col_wise': True,
    'num_leaves': 256,
    'n_estimators': 50,
}

base_model = lgb.LGBMRegressor(**model_params)

multi_model = MultiOutputRegressor(base_model)

# Just fit with X and y (no feature_name/categorical awareness)
%time multi_model.fit(X_train, y_train)


ValueError: could not convert string to float: 'HOBBIES_1_001_CA_1_evaluation'

We'll manually train the model here, which allows us to specify which features should be treated as categorical.

In [215]:
model_params = {
    'verbose': -1,
    'force_col_wise': True,
    'num_leaves': 256,
    'n_estimators': 50,
}
model = lgb.LGBMRegressor(**model_params)
%time model.fit(X, y, feature_name=fcst.ts.features_order_, categorical_feature=categoricals)

CPU times: total: 4min 8s
Wall time: 18.1 s


In [220]:
fcst.models_ = {'LGBMRegressor': model}

In [221]:
fcst.models_["LGBMRegressor"].feature_name_


['id',
 'item_id',
 'dept_id',
 'cat_id',
 'store_id',
 'state_id',
 'event_name_1',
 'event_type_1',
 'event_name_2',
 'event_type_2',
 'snap_CA',
 'snap_TX',
 'snap_WI',
 'sell_price',
 'lag7',
 'lag14',
 'lag21',
 'lag28',
 'lag35',
 'lag42',
 'lag49',
 'lag56',
 'expanding_mean_lag1',
 'rolling_mean_lag7_window_size7',
 'rolling_mean_lag7_window_size14',
 'rolling_mean_lag7_window_size28',
 'seasonal_rolling_mean_lag7_season_length7_window_size4',
 'rolling_mean_lag14_window_size7',
 'rolling_mean_lag14_window_size14',
 'rolling_mean_lag14_window_size28',
 'seasonal_rolling_mean_lag14_season_length7_window_size4',
 'rolling_mean_lag28_window_size7',
 'rolling_mean_lag28_window_size14',
 'rolling_mean_lag28_window_size28',
 'seasonal_rolling_mean_lag28_season_length7_window_size4',
 'year',
 'month',
 'day',
 'weekday',
 'quarter',
 'week']

## Forecasting

We now override the `models_` attribute to generate predictions, as described [here](https://nixtla.github.io/mlforecast/docs/how-to-guides/custom_training.html#custom-training).

In [161]:
from datetime import datetime, timedelta


In [191]:
X_df

id,date,y,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI,sell_price
cat,datetime[μs],f32,cat,cat,cat,cat,i32,i32,i32,f32
"""HOBBIES_1_001_CA_1_evaluation""",2016-04-26 00:00:00,0.0,"""nan""","""nan""","""nan""","""nan""",0,0,0,8.38
"""HOBBIES_1_001_CA_1_evaluation""",2016-04-27 00:00:00,0.0,"""nan""","""nan""","""nan""","""nan""",0,0,0,8.38
"""HOBBIES_1_001_CA_1_evaluation""",2016-04-28 00:00:00,2.0,"""nan""","""nan""","""nan""","""nan""",0,0,0,8.38
"""HOBBIES_1_001_CA_1_evaluation""",2016-04-29 00:00:00,0.0,"""nan""","""nan""","""nan""","""nan""",0,0,0,8.38
"""HOBBIES_1_001_CA_1_evaluation""",2016-04-30 00:00:00,3.0,"""Pesach End""","""Religious""","""nan""","""nan""",0,0,0,8.38
…,…,…,…,…,…,…,…,…,…,…
"""FOODS_3_677_WI_3_evaluation""",2016-05-20 00:00:00,0.0,"""nan""","""nan""","""nan""","""nan""",0,0,0,3.98
"""FOODS_3_677_WI_3_evaluation""",2016-05-21 00:00:00,0.0,"""nan""","""nan""","""nan""","""nan""",0,0,0,3.98
"""FOODS_3_677_WI_3_evaluation""",2016-05-22 00:00:00,1.0,"""nan""","""nan""","""nan""","""nan""",0,0,0,3.98
"""FOODS_3_677_WI_3_evaluation""",2016-05-23 00:00:00,,,,,,,,,


In [232]:
X_df = valid_long.drop(['item_id', 'dept_id', 'cat_id', 'store_id', 'state_id'] + ["y", "d", "wm_yr_wk"])

fcst.predict(h=28, X_df=X_df)

id,date,LGBMRegressor
cat,datetime[μs],f64
"""HOBBIES_1_001_CA_1_evaluation""",2016-04-25 00:00:00,0.750019
"""HOBBIES_1_001_CA_1_evaluation""",2016-04-26 00:00:00,0.763703
"""HOBBIES_1_001_CA_1_evaluation""",2016-04-27 00:00:00,0.712938
"""HOBBIES_1_001_CA_1_evaluation""",2016-04-28 00:00:00,0.825271
"""HOBBIES_1_001_CA_1_evaluation""",2016-04-29 00:00:00,0.822125
…,…,…
"""FOODS_3_677_WI_3_evaluation""",2016-05-18 00:00:00,0.235031
"""FOODS_3_677_WI_3_evaluation""",2016-05-19 00:00:00,0.228713
"""FOODS_3_677_WI_3_evaluation""",2016-05-20 00:00:00,0.260275
"""FOODS_3_677_WI_3_evaluation""",2016-05-21 00:00:00,0.280479


In [235]:
fcst.predict(h=29, X_df=X_df)



id,date,LGBMRegressor
cat,datetime[μs],f64
"""HOBBIES_1_001_CA_1_evaluation""",2016-04-25 00:00:00,0.750019
"""HOBBIES_1_001_CA_1_evaluation""",2016-04-26 00:00:00,0.763703
"""HOBBIES_1_001_CA_1_evaluation""",2016-04-27 00:00:00,0.712938
"""HOBBIES_1_001_CA_1_evaluation""",2016-04-28 00:00:00,0.825271
"""HOBBIES_1_001_CA_1_evaluation""",2016-04-29 00:00:00,0.822125
…,…,…
"""FOODS_3_677_WI_3_evaluation""",2016-05-19 00:00:00,0.228713
"""FOODS_3_677_WI_3_evaluation""",2016-05-20 00:00:00,0.260275
"""FOODS_3_677_WI_3_evaluation""",2016-05-21 00:00:00,0.280479
"""FOODS_3_677_WI_3_evaluation""",2016-05-22 00:00:00,0.283117


In [231]:
static.null_count()

id,date,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI,sell_price
u32,u32,u32,u32,u32,u32,u32,u32,u32,u32
0,0,0,0,0,0,0,0,0,0


In [259]:
from datetime import timedelta
import polars as pl

offset = 1
horizon = 28-offset
forecast_start = valid_start_date + timedelta(days=offset)
forecast_end = forecast_start + timedelta(days=horizon)

extended_long = long.filter(pl.col("date") <= forecast_start)

_ = fcst.preprocess(
    extended_long.drop(["d", "wm_yr_wk"]),
    id_col="id",
    time_col="date",
    target_col="y",
    static_features=categoricals,
    return_X_y=True,    
    as_numpy=True,
)

X_df = fcst.make_future_dataframe(h=horizon)
X_df = X_df.filter(pl.col("date") >= forecast_start)

print(min(fcst.make_future_dataframe(h=horizon)['date']))

static = (
    long
    .filter(pl.col("date") >= valid_start_date)
    .drop(['item_id', 'dept_id', 'cat_id', 'store_id', 'state_id'] + ["y", "d", "wm_yr_wk"])
)
X_df = X_df.join(static, on=["id", "date"], how="left").drop()

new_df = (
    long.drop(["d", "wm_yr_wk"])
        .filter((pl.col("date") >= forecast_start) & (pl.col("date") <= forecast_end))
)

fcst.models_ = {"LGBMRegressor": model}  # Only if not already set
preds = fcst.predict(h=horizon, X_df=X_df)

joined = preds.join(valid_long.select(["date", "id", "y"]), on=["id", "date"], how="inner")

# 2. Compute squared error
joined = joined.with_columns([
    (pl.col("y") - pl.col("LGBMRegressor")).pow(2).alias("squared_error")
])

# 3. Compute RMSE
rmse = (joined["squared_error"].mean()) ** 0.5

print(f"RMSE: {rmse:.10f}")

2016-04-26 00:00:00
RMSE: 2.1166683581


In [239]:
X_df.null_count()

id,date,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI,sell_price
u32,u32,u32,u32,u32,u32,u32,u32,u32,u32
0,0,30490,30490,30490,30490,30490,30490,30490,30490


In [189]:
static

y,date,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI,sell_price
f32,datetime[μs],cat,cat,cat,cat,i32,i32,i32,f32
1.0,2016-04-24 00:00:00,"""nan""","""nan""","""nan""","""nan""",0,0,0,8.38
0.0,2016-04-25 00:00:00,"""nan""","""nan""","""nan""","""nan""",0,0,0,8.38
0.0,2016-04-26 00:00:00,"""nan""","""nan""","""nan""","""nan""",0,0,0,8.38
0.0,2016-04-27 00:00:00,"""nan""","""nan""","""nan""","""nan""",0,0,0,8.38
2.0,2016-04-28 00:00:00,"""nan""","""nan""","""nan""","""nan""",0,0,0,8.38
…,…,…,…,…,…,…,…,…,…
0.0,2016-05-18 00:00:00,"""nan""","""nan""","""nan""","""nan""",0,0,0,3.98
0.0,2016-05-19 00:00:00,"""nan""","""nan""","""nan""","""nan""",0,0,0,3.98
0.0,2016-05-20 00:00:00,"""nan""","""nan""","""nan""","""nan""",0,0,0,3.98
0.0,2016-05-21 00:00:00,"""nan""","""nan""","""nan""","""nan""",0,0,0,3.98


In [186]:
X_df

id,date,sell_price
cat,datetime[μs],f32
"""HOBBIES_1_001_CA_1_evaluation""",2016-04-26 00:00:00,8.38
"""HOBBIES_1_001_CA_1_evaluation""",2016-04-27 00:00:00,8.38
"""HOBBIES_1_001_CA_1_evaluation""",2016-04-28 00:00:00,8.38
"""HOBBIES_1_001_CA_1_evaluation""",2016-04-29 00:00:00,8.38
"""HOBBIES_1_001_CA_1_evaluation""",2016-04-30 00:00:00,8.38
…,…,…
"""FOODS_3_677_WI_3_evaluation""",2016-05-20 00:00:00,3.98
"""FOODS_3_677_WI_3_evaluation""",2016-05-21 00:00:00,3.98
"""FOODS_3_677_WI_3_evaluation""",2016-05-22 00:00:00,3.98
"""FOODS_3_677_WI_3_evaluation""",2016-05-23 00:00:00,


In [174]:
print(fcst.make_future_dataframe(h=horizon))


shape: (884_210, 2)
┌───────────────────────────────┬─────────────────────┐
│ id                            ┆ date                │
│ ---                           ┆ ---                 │
│ cat                           ┆ datetime[μs]        │
╞═══════════════════════════════╪═════════════════════╡
│ HOBBIES_1_001_CA_1_evaluation ┆ 2016-04-26 00:00:00 │
│ HOBBIES_1_001_CA_1_evaluation ┆ 2016-04-27 00:00:00 │
│ HOBBIES_1_001_CA_1_evaluation ┆ 2016-04-28 00:00:00 │
│ HOBBIES_1_001_CA_1_evaluation ┆ 2016-04-29 00:00:00 │
│ HOBBIES_1_001_CA_1_evaluation ┆ 2016-04-30 00:00:00 │
│ …                             ┆ …                   │
│ FOODS_3_677_WI_3_evaluation   ┆ 2016-05-20 00:00:00 │
│ FOODS_3_677_WI_3_evaluation   ┆ 2016-05-21 00:00:00 │
│ FOODS_3_677_WI_3_evaluation   ┆ 2016-05-22 00:00:00 │
│ FOODS_3_677_WI_3_evaluation   ┆ 2016-05-23 00:00:00 │
│ FOODS_3_677_WI_3_evaluation   ┆ 2016-05-24 00:00:00 │
└───────────────────────────────┴─────────────────────┘


In [165]:
from datetime import timedelta
import polars as pl

# Define forecast horizon
horizon = 29
forecast_start = valid_start_date + timedelta(days=1)
forecast_end = forecast_start + timedelta(days=horizon - 1)

# Step 1: Generate valid X_df for future predictions
X_df = fcst.make_future_dataframe(h=horizon)

# Step 1b (optional): Add static features (if used in training)
static_cols = ["id", "item_id", "store_id", "state_id", "dept_id", "cat_id"]
static = long.select(["date","id","sell_price"])

static = (
    long
    .filter(pl.col("date") == valid_start_date)
    .select(["id", "sell_price"])
    .unique("id")
)
X_df = X_df.join(static, on="id", how="left")

X_df = fcst.make_future_dataframe(h=horizon)

# Step 2: Create new_df with dynamic features for forecast window
new_df = (
    long.drop(["d", "wm_yr_wk"])
        .filter((pl.col("date") >= forecast_start) & (pl.col("date") <= forecast_end))
)

print(len(fcst.get_missing_future(horizon, X_df)))

# Step 3: Fill missing values for features used during training
# You MUST include every column that was used during training
new_df = new_df.with_columns([
    pl.col("event_name_1").fill_null("none"),
    pl.col("event_type_1").fill_null("none"),
    pl.col("sell_price").fill_null(0),  # Adjust as needed
    # Add others if your model used more columns
])

# Step 4: Predict
fcst.models_ = {"LGBMRegressor": model}  # if not already set
preds = fcst.predict(h=horizon, X_df=X_df)

# Optional: display or export preds
print(preds.head())


0


ValueError: Found no exogenous features in `X_df`.

In [138]:
fcst.get_missing_future(29, X_df)

id,date
cat,datetime[μs]


In [137]:
X_df = long.drop(["d","wm_yr_wk","item_id","store_id","state_id","dept_id","cat_id","y"])

new = long.drop(["d","wm_yr_wk" ]).filter(pl.col('date') <= valid_start_date + timedelta(days=1))

fcst.models_ = {'LGBMRegressor': model}
%time preds = fcst.predict(h=29, new_df=new,X_df=X_df )


ValueError: Found missing inputs in X_df. It should have one row per id and time for the complete forecasting horizon.
You can get the expected structure by running `MLForecast.make_future_dataframe(h)` or get the missing combinatins in your current `X_df` by running `MLForecast.get_missing_future(h, X_df)`.

In [19]:
preds.join(valid_long["id","date","y"],how="inner",on=["id","date"])

NameError: name 'preds' is not defined

In [None]:
preds.join(eval_long)

id,date,LGBMRegressor
cat,datetime[μs],f64
"""HOBBIES_1_001_CA_1_evaluation""",2016-04-24 00:00:00,1.004026
"""HOBBIES_1_001_CA_1_evaluation""",2016-04-25 00:00:00,0.725626
"""HOBBIES_1_001_CA_1_evaluation""",2016-04-26 00:00:00,0.733981
"""HOBBIES_1_001_CA_1_evaluation""",2016-04-27 00:00:00,0.694137
"""HOBBIES_1_001_CA_1_evaluation""",2016-04-28 00:00:00,0.857808
…,…,…
"""FOODS_3_583_WI_3_evaluation""",2016-05-17 00:00:00,0.841036
"""FOODS_3_583_WI_3_evaluation""",2016-05-18 00:00:00,0.908861
"""FOODS_3_583_WI_3_evaluation""",2016-05-19 00:00:00,0.835177
"""FOODS_3_583_WI_3_evaluation""",2016-05-20 00:00:00,0.890161


In [None]:
from utilsforecast.evaluation import evaluate
from utilsforecast.losses import mse, mae, rmse, mape

# Join predictions and ground truth
df_eval = preds.join(
    valid_long.select(["id", "date", "y"]),
    how="inner",
    on=["id", "date"]
).rename({"id": "unique_id","date":"ds"})  # required by utilsforecast

# Evaluate using Nixtla's metrics
metrics = evaluate(
    df_eval,
    metrics=[rmse, mae, mse, mape]
)

print(metrics)


shape: (121_960, 3)
┌───────────────────────────────┬────────┬───────────────┐
│ unique_id                     ┆ metric ┆ LGBMRegressor │
│ ---                           ┆ ---    ┆ ---           │
│ cat                           ┆ str    ┆ f64           │
╞═══════════════════════════════╪════════╪═══════════════╡
│ HOBBIES_1_001_CA_1_evaluation ┆ rmse   ┆ 1.365652      │
│ HOBBIES_1_002_CA_1_evaluation ┆ rmse   ┆ 0.506787      │
│ HOBBIES_1_003_CA_1_evaluation ┆ rmse   ┆ 0.933877      │
│ HOBBIES_1_004_CA_1_evaluation ┆ rmse   ┆ 1.562539      │
│ HOBBIES_1_005_CA_1_evaluation ┆ rmse   ┆ 1.217481      │
│ …                             ┆ …      ┆ …             │
│ FOODS_3_579_WI_3_evaluation   ┆ mape   ┆ 0.681225      │
│ FOODS_3_580_WI_3_evaluation   ┆ mape   ┆ 0.794133      │
│ FOODS_3_581_WI_3_evaluation   ┆ mape   ┆ 0.672644      │
│ FOODS_3_582_WI_3_evaluation   ┆ mape   ┆ 0.608619      │
│ FOODS_3_583_WI_3_evaluation   ┆ mape   ┆ 0.232426      │
└───────────────────────────────┴───

In [None]:
metrics.group_by("metric").agg([
    pl.col("LGBMRegressor").mean().alias("mean"),
    pl.col("LGBMRegressor").std().alias("std"),
    pl.col("LGBMRegressor").min().alias("min"),
    pl.col("LGBMRegressor").max().alias("max"),
    pl.col("LGBMRegressor").median().alias("median"),
])


metric,mean,std,min,max,median
str,f64,f64,f64,f64,f64
"""mse""",4.444617,26.503637,0.016103,1565.380037,0.834652
"""mape""",0.603382,0.242278,0.036308,17.283863,0.588092
"""mae""",1.045478,1.249883,0.124908,30.777167,0.722405
"""rmse""",1.344253,1.624095,0.1269,39.564884,0.913593


In [None]:
df_eval

unique_id,date,LGBMRegressor,y
cat,datetime[μs],f64,f32
"""HOBBIES_1_001_CA_1_evaluation""",2016-04-24 00:00:00,1.004026,1.0
"""HOBBIES_1_001_CA_1_evaluation""",2016-04-25 00:00:00,0.725626,0.0
"""HOBBIES_1_001_CA_1_evaluation""",2016-04-26 00:00:00,0.733981,0.0
"""HOBBIES_1_001_CA_1_evaluation""",2016-04-27 00:00:00,0.694137,0.0
"""HOBBIES_1_001_CA_1_evaluation""",2016-04-28 00:00:00,0.857808,2.0
…,…,…,…
"""FOODS_3_583_WI_3_evaluation""",2016-05-17 00:00:00,0.841036,0.0
"""FOODS_3_583_WI_3_evaluation""",2016-05-18 00:00:00,0.908861,0.0
"""FOODS_3_583_WI_3_evaluation""",2016-05-19 00:00:00,0.835177,1.0
"""FOODS_3_583_WI_3_evaluation""",2016-05-20 00:00:00,0.890161,2.0


In [None]:
rmse

2.1082259474118947

## Submission

In [None]:
wide = preds.pivot(values='LGBMRegressor', index='id', on='date')
wide.columns = ['id'] + [f'F{i+1}' for i in range(28)]
wide = wide.with_columns(pl.col('id').cast(pl.Utf8))
wide

id,F1,F2,F3,F4,F5,F6,F7,F8,F9,F10,F11,F12,F13,F14,F15,F16,F17,F18,F19,F20,F21,F22,F23,F24,F25,F26,F27,F28
str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""HOBBIES_1_001_CA_1_evaluation""",0.833897,0.840057,0.797003,0.981697,1.001909,1.354826,1.353887,0.81619,0.921808,0.951556,0.967182,1.01913,1.361474,1.169812,0.863993,0.881578,0.860041,0.922818,0.996496,1.349537,1.227884,0.80713,0.778652,0.80238,0.89425,0.927711,1.159154,1.094949
"""HOBBIES_1_002_CA_1_evaluation""",0.301608,0.337076,0.30109,0.29595,0.351041,0.404711,0.380121,0.224811,0.2927,0.307221,0.290396,0.361029,0.403829,0.398394,0.328379,0.325858,0.296514,0.300414,0.358873,0.405415,0.415314,0.328379,0.321958,0.3319,0.328,0.377666,0.433479,0.392115
"""HOBBIES_1_003_CA_1_evaluation""",0.525617,0.507423,0.523173,0.575457,0.773517,0.806967,0.852567,0.534406,0.596685,0.653209,0.601299,0.727181,0.842056,0.906656,0.570054,0.582903,0.600682,0.627493,0.716215,0.792254,0.843897,0.553248,0.562246,0.550727,0.592842,0.717597,0.769499,0.806967
"""HOBBIES_1_004_CA_1_evaluation""",1.669314,1.408619,1.46908,1.55035,1.562228,2.026032,2.472441,1.166942,1.599797,1.691126,1.662757,1.92003,2.279371,2.510155,1.762801,1.578394,1.597008,1.621594,1.846485,2.145817,2.458724,1.855348,1.596231,1.636001,1.588729,1.724641,2.110413,2.31892
"""HOBBIES_1_005_CA_1_evaluation""",1.243967,1.175488,1.119334,1.101344,1.186044,1.419581,1.408517,0.953076,1.022751,1.09324,1.108721,1.256097,1.40743,1.544462,1.133089,1.114874,1.114592,1.161211,1.190928,1.363148,1.458089,1.107447,1.066974,1.027533,1.004561,1.1528,1.285675,1.333709
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""FOODS_3_579_WI_3_evaluation""",0.240898,0.256534,0.23278,0.215711,0.227391,0.241789,0.241789,0.159824,0.215166,0.257338,0.242633,0.303502,0.266146,0.34995,0.328606,0.250371,0.230068,0.308539,0.289797,0.349088,0.349088,0.298956,0.271631,0.292644,0.242351,0.246009,0.264338,0.258524
"""FOODS_3_580_WI_3_evaluation""",6.80407,5.391587,4.988874,5.805819,5.531085,6.623079,9.018246,6.10223,4.628618,9.278724,11.11929,9.949909,8.079201,13.477679,10.683496,8.373447,9.313737,11.042063,6.592893,12.691774,10.939248,7.593142,9.59427,10.841804,5.633552,6.039898,7.073972,7.313361
"""FOODS_3_581_WI_3_evaluation""",0.220488,0.20723,0.20723,0.21237,0.230425,0.238449,0.205814,0.135448,0.185589,0.265597,0.234152,0.300215,0.257665,0.341469,0.308196,0.248857,0.237482,0.283318,0.268566,0.340607,0.324712,0.290475,0.254222,0.300058,0.217975,0.253423,0.255857,0.234148
"""FOODS_3_582_WI_3_evaluation""",2.700242,2.632899,2.294907,2.426135,2.789663,3.302588,3.811948,2.499263,2.589231,2.481936,2.678291,3.274891,3.226891,4.092828,2.964795,2.805928,2.488012,2.94113,2.928715,3.645687,4.292225,2.639021,2.853514,2.595108,2.54386,2.783921,3.286068,3.22004


In [None]:
subm = pl.concat([wide, wide.with_columns(pl.col('id').str.replace('evaluation', 'validation'))])
subm.write_csv('submission.csv')