In [15]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [16]:
import pandas as pd
from src.features.future_features import build_future_features
from src.utils.timeseries_split import (
    compute_min_hist, rolling_time_series_cv, select_by_index
)
from src.features.build_features import build_features
from src.pipeline.per_customer import per_customer_cv
import numpy as np
from src.data.preprocess import preprocess_all_customers
from src.data.loader import load_raw, reindex_daily
from src.eval.run_baselines import run_baselines_per_customer
from src.eval.run_candidates import run_candidates_per_customer

In [7]:
# Load the raw training data through the project loader.
# Note: the file name contains a space ("train set.csv").
raw_csv_path = "data/raw/train set.csv"
df = load_raw(raw_csv_path)

In [8]:
# Clean every customer's series with the project preprocessing helper and
# keep both the cleaned frame and the per-customer summary it returns.
#
# NOTE(review): `causal=False` is used here, ahead of the time-series CV
# below. If the non-causal mode lets gap handling look at later observations,
# this could leak future information into the folds -- confirm against
# `preprocess_all_customers`.
# NOTE(review): `summary` is rebound by the CV cells further down; re-run
# this cell if the preprocessing summary is needed again.
df_clean, summary = preprocess_all_customers(
    df,
    long_gap_days=30,
    min_nonzero_run=5,
    min_nonzero_value=1.0,
    gap_limit=7,
    causal=False,
    verbose=True,
)

print("Cleaned dataset shape:", df_clean.shape)
# Rich display renders the frame as a table instead of plain text.
display(summary[["CUSTOMER", "inactive_lead_days", "clean_start", "clean_end"]])

[clean_and_truncate_series] {'active': True, 'orig_len': 1402, 'clean_len': 1041, 'active_days': 1041, 'orig_start': Timestamp('2019-08-01 00:00:00'), 'orig_end': Timestamp('2023-06-02 00:00:00'), 'clean_start': Timestamp('2020-07-27 00:00:00'), 'clean_end': Timestamp('2023-06-02 00:00:00'), 'inactive_lead_days': 361, 'inactive_tail_days': 11, 'gap_limit': 7, 'causal': False, 'min_nonzero_run': 5, 'min_nonzero_value': 1.0}
[clean_and_truncate_series] {'active': True, 'orig_len': 1485, 'clean_len': 1272, 'active_days': 1272, 'orig_start': Timestamp('2019-05-10 00:00:00'), 'orig_end': Timestamp('2023-06-02 00:00:00'), 'clean_start': Timestamp('2019-12-09 00:00:00'), 'clean_end': Timestamp('2023-06-02 00:00:00'), 'inactive_lead_days': 213, 'inactive_tail_days': 11, 'gap_limit': 7, 'causal': False, 'min_nonzero_run': 5, 'min_nonzero_value': 1.0}
[clean_and_truncate_series] {'active': True, 'orig_len': 2072, 'clean_len': 2072, 'active_days': 2072, 'orig_start': Timestamp('2017-09-30 00:00:0

In [9]:
# --- Cross-validation settings (shared by the evaluation cells) ---
HORIZON_DAYS = 25
STEP_DAYS = 7
N_FOLDS = 5
WINDOW_TYPE = "expanding"

# --- Feature settings ---
MAX_LAG = 30
ROLL_WINDOWS = [7, 14, 30]
# Derived from the lag / rolling-window settings by the project helper.
MIN_HIST = compute_min_hist(MAX_LAG, ROLL_WINDOWS)

# --- Holiday settings ---
HOLIDAY_COUNTRY = "FR"
HOLIDAY_SUBDIV_MAP = None
HOLIDAY_WINDOW = 3

In [10]:
# Evaluate the baseline models per customer, driven by the shared
# CV / feature / holiday constants from the configuration cell.
baseline_cv_kwargs = dict(
    n_folds=N_FOLDS,
    window_type=WINDOW_TYPE,
    step_days=STEP_DAYS,
    horizon_days=HORIZON_DAYS,
    gap_days=0,
    max_lag=MAX_LAG,
    roll_windows=ROLL_WINDOWS,
    holiday_country=HOLIDAY_COUNTRY,
    holiday_subdiv_map=HOLIDAY_SUBDIV_MAP,
    holiday_window=HOLIDAY_WINDOW,
    trim_by_history=True,
    dropna_mode="none",
    out_dir="outputs/cv",
    save_csv=True,
)
per_fold, summary = run_baselines_per_customer(df_clean, **baseline_cv_kwargs)

# Show the first per-fold rows, then the per-customer summary.
display(per_fold.head(12), summary)



Unnamed: 0,CUSTOMER,fold,anchor,model,MAE,RMSE,sMAPE,n
0,ARGALYS,1,2020-10-25,ETS-add,7.062005,9.619104,57.254531,25
1,ARGALYS,2,2020-11-01,ETS-add,9.150523,12.357691,60.717651,25
2,ARGALYS,3,2020-11-08,ETS-add,11.6581,15.168005,68.272525,25
3,ARGALYS,4,2020-11-15,ETS-add,10.827963,14.136137,57.958115,25
4,ARGALYS,5,2020-11-22,ETS-add,15.431875,17.725259,63.118431,25
5,ARGALYS,1,2020-10-25,Naive-1,6.48,7.793159,64.499231,25
6,ARGALYS,2,2020-11-01,Naive-1,6.96,8.557648,54.432514,25
7,ARGALYS,3,2020-11-08,Naive-1,8.04,9.590794,49.735365,25
8,ARGALYS,4,2020-11-15,Naive-1,9.98,12.075595,55.306101,25
9,ARGALYS,5,2020-11-22,Naive-1,12.24,14.502529,65.031821,25


Unnamed: 0,CUSTOMER,model,MAE,RMSE,sMAPE
0,ARGALYS,Naive-1,8.74,10.503945,57.801006
1,ARGALYS,ETS-add,10.826093,13.801239,61.46425
2,ARGALYS,Seasonal-7,12.46,16.180457,102.551615
3,LES MIRACULEUX,Naive-1,57.472,77.016345,48.678259
4,LES MIRACULEUX,ETS-mul,75.095238,97.881686,77.007965
5,LES MIRACULEUX,Seasonal-7,67.288,86.476572,87.130333
6,MINCI DELICE,ETS-mul,734.09308,916.392894,32.411028
7,MINCI DELICE,Naive-1,692.016,872.844329,36.36013
8,MINCI DELICE,Seasonal-7,910.768,1243.399795,72.091844
9,NUTRAVANCE,Naive-1,29.232,47.03903,40.893553


In [59]:
# Evaluate the candidate models listed in the model matrix, per customer.
# The CV / feature / holiday arguments come from the shared configuration
# constants (the same ones the baseline run uses) rather than repeated
# literals, so both evaluations always share one protocol.
#
# NOTE(review): the saved output of this run contains `*_ERROR` rows
# (e.g. ARIMA failing with a "missing 3 required keywor..." message).
# It writes to the same `out_dir` as the later re-run of this call, so the
# later run presumably overwrites these CSVs -- confirm.
per_fold, summary = run_candidates_per_customer(
    df_clean,
    model_matrix_path="configs/model_matrix.yaml",
    n_folds=N_FOLDS,
    window_type=WINDOW_TYPE,
    step_days=STEP_DAYS,
    horizon_days=HORIZON_DAYS,
    gap_days=0,
    max_lag=MAX_LAG,
    roll_windows=ROLL_WINDOWS,
    holiday_country=HOLIDAY_COUNTRY,
    holiday_subdiv_map=HOLIDAY_SUBDIV_MAP,
    holiday_window=HOLIDAY_WINDOW,
    trim_by_history=True,
    dropna_mode="none",
    out_dir="outputs/cv/candidates",
    save_csv=True,
)

display(per_fold.head(12))
display(summary)

[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 31, number of used features: 0
[LightGBM] [Info] Start training from score 4.567978
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 38, number of used features: 0
[LightGBM] [Info] Start training from score 4.494536
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000087 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 718
[LightGBM] [Info] Number of data points in the train set: 45, number of used features: 52
[LightGBM] [Info] Start training from score 4.402196
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000138 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 822
[LightGBM] [Info] Number of data points in the train set:



Unnamed: 0,CUSTOMER,fold,anchor,model,MAE,RMSE,sMAPE,n,error
0,ARGALYS,1,2020-10-25,ARIMA_ERROR,,,,25,fit_forecast_arima() missing 3 required keywor...
1,ARGALYS,2,2020-11-01,ARIMA_ERROR,,,,25,fit_forecast_arima() missing 3 required keywor...
2,ARGALYS,3,2020-11-08,ARIMA_ERROR,,,,25,fit_forecast_arima() missing 3 required keywor...
3,ARGALYS,4,2020-11-15,ARIMA_ERROR,,,,25,fit_forecast_arima() missing 3 required keywor...
4,ARGALYS,5,2020-11-22,ARIMA_ERROR,,,,25,fit_forecast_arima() missing 3 required keywor...
5,ARGALYS,1,2020-10-25,ETS-Add,7.138848,9.96671,58.31165,25,
6,ARGALYS,2,2020-11-01,ETS-Add,9.098311,12.396143,60.673097,25,
7,ARGALYS,3,2020-11-08,ETS-Add,11.218919,14.765265,64.161226,25,
8,ARGALYS,4,2020-11-15,ETS-Add,11.527873,14.97198,63.324246,25,
9,ARGALYS,5,2020-11-22,ETS-Add,14.018408,16.10738,59.959565,25,


Unnamed: 0,CUSTOMER,model,MAE,RMSE,sMAPE
0,ARGALYS,ETS-Add,10.600472,13.641496,61.285957
1,ARGALYS,ARIMA_ERROR,,,
2,ARGALYS,XGB_ERROR,,,
3,LES MIRACULEUX,LGBM,50.378678,70.990118,41.496728
4,LES MIRACULEUX,ProphetMul_ERROR,,,
5,LES MIRACULEUX,SARIMA_111_111_12_ERROR,,,
6,MINCI DELICE,HW-Mul,591.791816,711.287693,29.089147
7,MINCI DELICE,ProphetMul_ERROR,,,
8,MINCI DELICE,SARIMA_weekly_ERROR,,,
9,NUTRAVANCE,ETS-Mul,38.163835,46.689076,82.984968


# Update 1: re-run the candidate models after fixing the `*_ERROR` failures

In [63]:
from src.models.arima_like import fit_forecast_sarima
from src.models.prophet_model import fit_predict_prophet
from src.models.gbm import fit_predict_gbm_once  
from src.utils.ml_safety import clean_design, align_like, has_enough_rows

In [17]:
# Re-run the candidate-model evaluation per customer.
# The CV / feature / holiday arguments come from the shared configuration
# constants (the same ones the baseline run uses) rather than repeated
# literals, so both evaluations always share one protocol.
#
# NOTE(review): in the saved output, ARIMA scores sMAPE == 200.0 on every
# ARGALYS fold shown, which usually indicates all-zero (or wrong-sign)
# forecasts -- verify the ARIMA predictions before trusting that row.
# NOTE(review): LightGBM logs "number of used features: 0" for the smallest
# training sets (31 and 38 rows), so those fits are presumably constant
# predictions -- confirm the folds have enough history.
per_fold, summary = run_candidates_per_customer(
    df_clean,
    model_matrix_path="configs/model_matrix.yaml",
    n_folds=N_FOLDS,
    window_type=WINDOW_TYPE,
    step_days=STEP_DAYS,
    horizon_days=HORIZON_DAYS,
    gap_days=0,
    max_lag=MAX_LAG,
    roll_windows=ROLL_WINDOWS,
    holiday_country=HOLIDAY_COUNTRY,
    holiday_subdiv_map=HOLIDAY_SUBDIV_MAP,
    holiday_window=HOLIDAY_WINDOW,
    trim_by_history=True,
    dropna_mode="none",
    out_dir="outputs/cv/candidates",
    save_csv=True,
)

display(per_fold.head(12))
display(summary)

[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 31, number of used features: 0
[LightGBM] [Info] Start training from score 8.274194
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 38, number of used features: 0
[LightGBM] [Info] Start training from score 8.315789
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000088 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 569
[LightGBM] [Info] Number of data points in the train set: 45, number of used features: 45
[LightGBM] [Info] Start training from score 8.255556
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000179 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 651
[LightGBM] [Info] Number of data points in the train set:



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000440 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 706
[LightGBM] [Info] Number of data points in the train set: 45, number of used features: 47
[LightGBM] [Info] Start training from score 73.844444
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000150 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 817
[LightGBM] [Info] Number of data points in the train set: 52, number of used features: 50
[LightGBM] [Info] Start training from score 71.634615
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000139 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 929
[LightGBM] [Info] Number of data points in the train set: 59, number of used features: 54
[LightGBM] [Info] Start training from

Unnamed: 0,CUSTOMER,fold,anchor,model,MAE,RMSE,sMAPE,n
0,ARGALYS,1,2020-10-25,ARIMA,12.4,16.110935,200.0,25
1,ARGALYS,2,2020-11-01,ARIMA,16.06,19.541551,200.0,25
2,ARGALYS,3,2020-11-08,ARIMA,19.82,22.706326,200.0,25
3,ARGALYS,4,2020-11-15,ARIMA,21.12,23.590782,200.0,25
4,ARGALYS,5,2020-11-22,ARIMA,21.96,25.577866,200.0,25
5,ARGALYS,1,2020-10-25,ETS-Add,7.138848,9.96671,58.31165,25
6,ARGALYS,2,2020-11-01,ETS-Add,9.098311,12.396143,60.673097,25
7,ARGALYS,3,2020-11-08,ETS-Add,11.218919,14.765265,64.161226,25
8,ARGALYS,4,2020-11-15,ETS-Add,11.527873,14.97198,63.324246,25
9,ARGALYS,5,2020-11-22,ETS-Add,14.018408,16.10738,59.959565,25


Unnamed: 0,CUSTOMER,model,MAE,RMSE,sMAPE
0,ARGALYS,ETS-Add,10.600472,13.641496,61.285957
1,ARGALYS,LGBM,11.11884,14.389494,71.378627
2,ARGALYS,ARIMA,18.272,21.505492,200.0
3,LES MIRACULEUX,LGBM,50.378678,70.990118,41.496728
4,LES MIRACULEUX,ProphetMul,60.069523,81.355826,55.107655
5,LES MIRACULEUX,SARIMA_111_111_12,70.448694,92.33023,55.435599
6,MINCI DELICE,SARIMA_weekly,568.198736,677.427921,28.497166
7,MINCI DELICE,HW-Mul,591.791816,711.287693,29.089147
8,MINCI DELICE,ProphetMul,598.138889,733.266627,30.509377
9,NUTRAVANCE,ETS-Mul,38.163835,46.689076,82.984968
