In [1]:
import pandas as pd

In [3]:
monthly = 'MS'
daily = 'D'

In [58]:
from utils.preprocessing import load_daily_data, load_monthly_data

daily_train, daily_val, daily_test = load_daily_data(use_existing=True)
monthly_train, monthly_val, monthly_test = load_monthly_data(use_existing=True)

daily_train

Unnamed: 0,unique_id,ds,y
0,Austria,2015-01-01,22.34
1,Austria,2015-01-02,22.34
2,Austria,2015-01-03,22.34
3,Austria,2015-01-04,22.34
4,Austria,2015-01-05,36.18
...,...,...,...
3648,Austria,2024-12-27,121.79
3649,Austria,2024-12-28,122.12
3650,Austria,2024-12-29,110.49
3651,Austria,2024-12-30,118.13


### daily

In [59]:
# MACHINE LEARNING:
## Machine Learning Models:
from mlforecast import MLForecast
from sklearn.linear_model import LinearRegression, HuberRegressor
from sklearn.ensemble import RandomForestRegressor
from lightgbm import LGBMRegressor


models_ml = [
    LinearRegression(),
    HuberRegressor(epsilon=1.35, alpha=1e-3),
    RandomForestRegressor(n_estimators=400, max_depth=20, min_samples_leaf=5, max_features='sqrt', random_state=42),
    LGBMRegressor(objective="regression", boosting_type="gbdt", num_leaves=31, max_depth=-1, learning_rate=0.03, n_estimators=1000, min_child_samples=30, subsample=0.8, colsample_bytree=0.8, reg_alpha=0.1, reg_lambda=0.1, random_state=42)
]

ml_forecast = MLForecast(models=models_ml, freq='D', lags=[356])  ##LAGS? ##DATE FEATURES? ##WINDOWS? 
ml_forecast.fit(daily_train[['unique_id','ds','y']])
fc_val = ml_forecast.predict(h = 356, X_df=daily_val)
df_val_ml = fc_val.merge(daily_val[['unique_id','ds','y']], on=['unique_id','ds'], how='left')
df_val_ml

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000062 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 255
[LightGBM] [Info] Number of data points in the train set: 3297, number of used features: 1
[LightGBM] [Info] Start training from score 81.136837


Unnamed: 0,unique_id,ds,LinearRegression,HuberRegressor,RandomForestRegressor,LGBMRegressor,y
0,Austria,2025-01-01,90.882059,66.129095,121.399521,205.096941,84.86
1,Austria,2025-01-02,89.774236,65.052914,76.019394,77.387660,121.51
2,Austria,2025-01-03,82.394611,57.884068,127.556705,152.038649,117.59
3,Austria,2025-01-04,80.864760,56.397913,103.966356,157.109517,126.19
4,Austria,2025-01-05,85.530512,60.930400,226.141138,215.762592,124.96
...,...,...,...,...,...,...,...
351,Austria,2025-12-18,94.609972,69.750529,79.110159,90.577273,135.16
352,Austria,2025-12-19,94.706686,69.844482,84.681183,90.577273,126.37
353,Austria,2025-12-20,91.298226,66.533374,191.753740,205.096941,111.08
354,Austria,2025-12-21,93.537318,68.708513,166.226126,132.385685,106.41


In [None]:
def mergeHolidays_daily(df):
    import holidays
    aut_holidays = holidays.Austria(years=range(2015, 2026))
    
    df['is_holiday'] = df['ds'].isin(aut_holidays.keys()).astype('int8')
    return df

def mergeHolidays_monthly(df, df_daily):
    if 'is_holiday' not in df_daily.columns:
        df_daily = mergeHolidays_daily(df_daily)

    daily_helper = df_daily[['unique_id', 'ds', 'is_holiday']].copy()
    daily_helper['year_month'] = daily_helper['ds'].dt.to_period('M')

    monthly_holidays = (
        daily_helper
        .groupby(['unique_id', 'year_month'], as_index=False)['is_holiday']
        .sum()
        .rename(columns={'is_holiday': 'count_holiday'})
    )

    monthly_holidays['ds'] = monthly_holidays['year_month'].dt.to_timestamp()
    monthly_holidays = monthly_holidays.drop(columns=['year_month'])

    df = df.merge(monthly_holidays, on=['unique_id', 'ds'], how='left')
    df['count_holiday'] = df['count_holiday'].fillna(0).astype('int16')

    return df


In [62]:
daily_train_holidays = mergeHolidays_daily(daily_train)
daily_val_holidays = mergeHolidays_daily(daily_val)
# daily_test_holidays = mergeHolidays_daily(daily_test)
    
# display(daily_val_holidays)

monthly_train_holidays = mergeHolidays_monthly(monthly_train, daily_train)
monthly_val_holidays = mergeHolidays_monthly(monthly_val, daily_val)
# monthly_test_holidays = mergeHolidays_monthly(monthly_test, daily_test)

# display(monthly_train_holidays)

  df['is_holiday'] = df['ds'].isin(aut_holidays.keys()).astype('int8')
  df['is_holiday'] = df['ds'].isin(aut_holidays.keys()).astype('int8')


Unnamed: 0,unique_id,ds,y,is_holiday
0,Austria,2025-01-01,84.86,1
1,Austria,2025-01-02,121.51,0
2,Austria,2025-01-03,117.59,0
3,Austria,2025-01-04,126.19,0
4,Austria,2025-01-05,124.96,0
...,...,...,...,...
351,Austria,2025-12-18,135.16,0
352,Austria,2025-12-19,126.37,0
353,Austria,2025-12-20,111.08,0
354,Austria,2025-12-21,106.41,0


Unnamed: 0,unique_id,ds,y,count_holiday
0,Austria,2015-01-01,29.935161,2
1,Austria,2015-02-01,36.695000,0
2,Austria,2015-03-01,31.297419,0
3,Austria,2015-04-01,29.778333,1
4,Austria,2015-05-01,25.329677,3
...,...,...,...,...
115,Austria,2024-08-01,84.477097,1
116,Austria,2024-09-01,82.400667,0
117,Austria,2024-10-01,85.353548,1
118,Austria,2024-11-01,130.205667,1


In [63]:
## ML with LAGS
from mlforecast.lag_transforms import RollingMean, RollingStd


ml_lags = [1, 7, 28]
data_features = ['dayofweek', 'month', 'quarter']
lag_transforms = {
    # Short-term volatility
    1: [RollingStd(window_size=3)],
    # Weekly trend
    7: [RollingMean(window_size=3)],
    # Monthly smoothing
    28: [RollingMean(window_size=14)],
}
ml_forecast_lags = MLForecast(models=models_ml, freq='D', lags=ml_lags, date_features=data_features)

ml_forecast_lags.fit(df=daily_train, static_features=[])
fc_val = ml_forecast_lags.predict(h = 356, X_df=daily_val)

df_val_ml_lags = fc_val.merge(daily_val[['unique_id','ds','y']], on=['unique_id','ds'], how='left')
df_val_ml_lags

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=100).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000122 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 792
[LightGBM] [Info] Number of data points in the train set: 3625, number of used features: 7
[LightGBM] [Info] Start training from score 76.709903


Unnamed: 0,unique_id,ds,LinearRegression,HuberRegressor,RandomForestRegressor,LGBMRegressor,y
0,Austria,2025-01-01,110.754521,112.012111,119.622166,111.027133,84.86
1,Austria,2025-01-02,111.937611,111.296547,124.089237,114.987644,121.51
2,Austria,2025-01-03,110.557038,110.056939,130.982597,112.601700,117.59
3,Austria,2025-01-04,104.687090,105.851838,127.018573,105.706346,126.19
4,Austria,2025-01-05,94.261500,98.403926,120.162366,101.464575,124.96
...,...,...,...,...,...,...,...
351,Austria,2025-12-18,100.092907,61.897894,252.355121,223.231113,135.16
352,Austria,2025-12-19,95.169929,58.914483,235.095295,220.744915,126.37
353,Austria,2025-12-20,85.561983,53.125963,225.891317,205.336663,111.08
354,Austria,2025-12-21,71.287130,44.578546,221.093562,193.839842,106.41


## MONTHLY

In [64]:
ml_forecast = MLForecast(models=models_ml, freq='MS', lags=[12])  ##LAGS? ##DATE FEATURES? ##WINDOWS? 
ml_forecast.fit(monthly_train[['unique_id','ds','y']])
fc_val = ml_forecast.predict(h = 12, X_df=monthly_val)
df_val_ml = fc_val.merge(monthly_val[['unique_id','ds','y']], on=['unique_id','ds'], how='left')
df_val_ml

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000015 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 37
[LightGBM] [Info] Number of data points in the train set: 108, number of used features: 1
[LightGBM] [Info] Start training from score 81.233215


Unnamed: 0,unique_id,ds,LinearRegression,HuberRegressor,RandomForestRegressor,LGBMRegressor,y
0,Austria,2025-01-01,82.626296,57.187356,204.252605,141.40901,134.17871
1,Austria,2025-02-01,78.269654,52.643626,206.544193,162.194929,140.771071
2,Austria,2025-03-01,77.536276,51.878755,206.62174,162.194929,103.604194
3,Austria,2025-04-01,76.390283,50.683549,202.619044,162.194929,81.323667
4,Austria,2025-05-01,77.912735,52.27138,206.62174,162.194929,71.016774
5,Austria,2025-06-01,79.10339,53.513166,211.233088,162.194929,66.672667
6,Austria,2025-07-01,77.766021,52.118365,206.62174,162.194929,87.98
7,Austria,2025-08-01,83.73966,58.348532,130.506461,141.40901,74.744194
8,Austria,2025-09-01,83.14383,57.727114,206.242483,141.40901,92.587667
9,Austria,2025-10-01,83.991157,58.610829,130.506461,141.40901,108.226452


## MONTH WITH LAG

In [65]:
## ML with LAGS
from mlforecast.lag_transforms import RollingMean, RollingStd


ml_lags = [1, 3, 12]
data_features = ['month', 'quarter', 'year']
lag_transforms = {
    # Short-term monthly volatility
    1: [RollingStd(window_size=3)],
    # Quarterly trend
    3: [RollingMean(window_size=2)],
    # Yearly smoothing
    12: [RollingMean(window_size=3)],
}

ml_forecast_lags = MLForecast( models=models_ml, freq='MS', lags=ml_lags, date_features=data_features, lag_transforms=lag_transforms)

ml_forecast_lags.fit(df=monthly_train, static_features=[])

fc_val = ml_forecast_lags.predict(h=12, X_df=monthly_val)

df_val_ml_lags = fc_val.merge(
    monthly_val[['unique_id', 'ds', 'y']],
    on=['unique_id', 'ds'],
    how='left'
)

df_val_ml_lags

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=100).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000037 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 250
[LightGBM] [Info] Number of data points in the train set: 106, number of used features: 9
[LightGBM] [Info] Start training from score 82.284558


Unnamed: 0,unique_id,ds,LinearRegression,HuberRegressor,RandomForestRegressor,LGBMRegressor,y
0,Austria,2025-01-01,123.746321,107.804804,133.261543,155.110704,134.17871
1,Austria,2025-02-01,115.618462,103.131897,121.872761,176.434232,140.771071
2,Austria,2025-03-01,124.336797,98.471823,111.040617,176.434232,103.604194
3,Austria,2025-04-01,122.859891,92.607152,106.738656,176.434232,81.323667
4,Austria,2025-05-01,128.071471,88.659191,102.561537,129.980502,71.016774
5,Austria,2025-06-01,135.33304,85.743875,98.028306,196.823206,66.672667
6,Austria,2025-07-01,136.13402,83.732848,100.632357,232.52031,87.98
7,Austria,2025-08-01,139.946471,82.386282,102.371693,223.012518,74.744194
8,Austria,2025-09-01,148.347559,81.830673,102.325115,208.005085,92.587667
9,Austria,2025-10-01,148.815134,82.227338,102.840521,208.005085,108.226452


# DEEP LEARNING


## dl daily

In [None]:
## Deep Learning Models:
from neuralforecast import NeuralForecast
from neuralforecast.models import NBEATS, DeepAR, NLinear, KAN
from neuralforecast.losses.pytorch import MAE

h_val = 365
input_val = 365*4

models_dl = [
    DeepAR(h=h_val, input_size=input_val, lstm_n_layers=1, trajectory_samples=100, loss=MAE(), valid_loss=MAE(), learning_rate=0.005, max_steps=200, scaler_type='standard', enable_progress_bar=True), 
    NLinear(h=h_val, input_size=input_val, loss=MAE(), scaler_type='robust', learning_rate=1e-3, max_steps=500), 
    NBEATS(h=h_val, input_size=input_val, basis='changepoint', n_basis=2, loss=MAE(), stack_types=['identity', 'trend','seasonality'], max_steps=100),
    KAN(h=h_val, input_size=input_val, loss=MAE(), scaler_type='robust',learning_rate=1e-3,max_steps=500)
]

dl_forecast = NeuralForecast(models=models_dl, freq=daily) ##LAGS? ##Date Features??

2025-12-22 17:59:27,319	INFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.
2025-12-22 17:59:31,828	INFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.
Seed set to 1
Seed set to 1
Seed set to 1
Seed set to 1
Seed set to 1


### daily with lag

## monthly

### monthly with lag