In [8]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [9]:
import pandas as pd
from src.features.future_features import build_future_features
from src.utils.timeseries_split import (
    compute_min_hist, rolling_time_series_cv, select_by_index
)
from src.features.build_features import build_features
from src.pipeline.per_customer import per_customer_cv
import numpy as np
from src.data.preprocess import preprocess_all_customers
from src.data.loader import load_raw, reindex_daily
from src.eval.run_baselines import run_baselines_per_customer
from src.eval.run_candidates import run_candidates_per_customer

In [10]:
# Location of the raw training CSV (note: the file name contains a space).
RAW_TRAIN_CSV = "data/raw/train set.csv"
# Load it through the project loader; later cells consume `df`.
df = load_raw(RAW_TRAIN_CSV)

In [11]:
# Clean every customer's series with the project preprocessing helper.
# The thresholds are forwarded as-is; see `preprocess_all_customers` for their
# exact semantics. verbose=True makes the helper log one
# [clean_and_truncate_series] record per customer.
# NOTE: `summary` is rebound by the later CV cells, so inspect the
# preprocessing summary here rather than further down the notebook.
df_clean, summary = preprocess_all_customers(
    df,
    long_gap_days=30,
    min_nonzero_run=5,
    min_nonzero_value=1.0,
    gap_limit=7,
    causal=False,
    verbose=True,
)

print("Cleaned dataset shape:", df_clean.shape)
# Use rich display rather than print() so the frame renders as a table
# instead of a truncated plain-text dump.
display(summary[["CUSTOMER", "inactive_lead_days", "clean_start", "clean_end"]])

[clean_and_truncate_series] {'active': True, 'orig_len': 1402, 'clean_len': 1041, 'active_days': 1041, 'orig_start': Timestamp('2019-08-01 00:00:00'), 'orig_end': Timestamp('2023-06-02 00:00:00'), 'clean_start': Timestamp('2020-07-27 00:00:00'), 'clean_end': Timestamp('2023-06-02 00:00:00'), 'inactive_lead_days': 361, 'inactive_tail_days': 11, 'gap_limit': 7, 'causal': False, 'min_nonzero_run': 5, 'min_nonzero_value': 1.0}
[clean_and_truncate_series] {'active': True, 'orig_len': 1485, 'clean_len': 1272, 'active_days': 1272, 'orig_start': Timestamp('2019-05-10 00:00:00'), 'orig_end': Timestamp('2023-06-02 00:00:00'), 'clean_start': Timestamp('2019-12-09 00:00:00'), 'clean_end': Timestamp('2023-06-02 00:00:00'), 'inactive_lead_days': 213, 'inactive_tail_days': 11, 'gap_limit': 7, 'causal': False, 'min_nonzero_run': 5, 'min_nonzero_value': 1.0}
[clean_and_truncate_series] {'active': True, 'orig_len': 2072, 'clean_len': 2072, 'active_days': 2072, 'orig_start': Timestamp('2017-09-30 00:00:0

In [12]:
# --- Cross-validation settings (forwarded to the CV runner below) ---
HORIZON_DAYS = 25          # -> horizon_days
STEP_DAYS = 7              # -> step_days
N_FOLDS = 5                # -> n_folds
WINDOW_TYPE = "expanding"  # -> window_type

# --- Feature settings ---
MAX_LAG = 30               # -> max_lag
ROLL_WINDOWS = [7, 14, 30]  # -> roll_windows
# Minimum history derived by the project helper from the lag / window sizes.
MIN_HIST = compute_min_hist(MAX_LAG, ROLL_WINDOWS)

# --- Holiday settings ---
HOLIDAY_COUNTRY = "FR"     # -> holiday_country
HOLIDAY_SUBDIV_MAP = None  # -> holiday_subdiv_map
HOLIDAY_WINDOW = 3         # -> holiday_window

In [13]:
# Keyword arguments for the baseline cross-validation run, built from the
# constants defined in the config cell.
baseline_cv_options = {
    "n_folds": N_FOLDS,
    "window_type": WINDOW_TYPE,
    "step_days": STEP_DAYS,
    "horizon_days": HORIZON_DAYS,
    "gap_days": 0,
    "max_lag": MAX_LAG,
    "roll_windows": ROLL_WINDOWS,
    "holiday_country": HOLIDAY_COUNTRY,
    "holiday_subdiv_map": HOLIDAY_SUBDIV_MAP,
    "holiday_window": HOLIDAY_WINDOW,
    "trim_by_history": True,
    "dropna_mode": "none",
    "out_dir": "outputs/cv",
    "save_csv": True,
}

# Run the baselines on the cleaned data; rebinds `per_fold` and `summary`.
per_fold, summary = run_baselines_per_customer(df_clean, **baseline_cv_options)

# Show the first 12 per-fold rows, followed by the full summary table.
display(per_fold.head(12), summary)



Unnamed: 0,CUSTOMER,fold,anchor,model,MAE,RMSE,sMAPE,n
0,ARGALYS,1,2023-03-04,ETS-add,25.705381,29.428582,48.66281,25
1,ARGALYS,2,2023-03-11,ETS-add,18.133214,23.545748,35.299416,25
2,ARGALYS,3,2023-03-18,ETS-add,22.799923,25.372977,43.255715,25
3,ARGALYS,4,2023-03-25,ETS-add,12.534101,16.346749,28.069924,25
4,ARGALYS,5,2023-04-01,ETS-add,22.569584,24.475657,45.076314,25
5,ARGALYS,1,2023-03-04,Naive-1,30.866667,33.769085,55.596179,25
6,ARGALYS,2,2023-03-11,Naive-1,19.6,28.874749,36.246214,25
7,ARGALYS,3,2023-03-18,Naive-1,30.186667,32.861866,54.1268,25
8,ARGALYS,4,2023-03-25,Naive-1,14.826667,21.203773,32.381938,25
9,ARGALYS,5,2023-04-01,Naive-1,29.826667,32.372485,55.378711,25


Unnamed: 0,CUSTOMER,model,MAE,RMSE,sMAPE
0,ARGALYS,Seasonal-7,19.314667,25.176275,37.209306
1,ARGALYS,ETS-add,20.34844,23.833943,40.072836
2,ARGALYS,Naive-1,25.061333,29.816392,46.745969
3,LES MIRACULEUX,Seasonal-7,283.870667,368.125204,25.037241
4,LES MIRACULEUX,ETS-mul,291.216893,367.191639,26.392761
5,LES MIRACULEUX,Naive-1,459.245333,525.871689,40.47557
6,MINCI DELICE,ETS-mul,903.428635,1062.305199,26.983943
7,MINCI DELICE,Seasonal-7,974.861333,1212.288632,30.973914
8,MINCI DELICE,Naive-1,1004.626667,1268.315731,32.602518
9,NUTRAVANCE,Seasonal-7,68.513333,84.589627,58.669643


In [14]:
# Evaluate the candidate models (configured through `model_matrix_path`) on the
# cleaned data. The CV / feature / holiday settings come from the config cell
# rather than repeated literals, so this run cannot silently drift from the
# baseline run; the constants hold the same values the literals had.
# NOTE(review): candidates are scored on 7 folds while the baselines use
# N_FOLDS (5). The value is kept to preserve the recorded results — confirm
# the mismatch is intentional before comparing the two summaries.
CANDIDATE_N_FOLDS = 7

# Rebinds `per_fold` and `summary`, replacing the baseline results.
per_fold, summary = run_candidates_per_customer(
    df_clean,
    model_matrix_path="configs/model_matrix.yaml",
    n_folds=CANDIDATE_N_FOLDS,
    window_type=WINDOW_TYPE,
    step_days=STEP_DAYS,
    horizon_days=HORIZON_DAYS,
    gap_days=0,
    max_lag=MAX_LAG,
    roll_windows=ROLL_WINDOWS,
    holiday_country=HOLIDAY_COUNTRY,
    holiday_subdiv_map=HOLIDAY_SUBDIV_MAP,
    holiday_window=HOLIDAY_WINDOW,
    trim_by_history=True,
    dropna_mode="none",
    out_dir="outputs/cv/candidates",
    save_csv=True,
)

# Show the first 12 per-fold rows, then the full summary table.
display(per_fold.head(12))
display(summary)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000499 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6049
[LightGBM] [Info] Number of data points in the train set: 891, number of used features: 62
[LightGBM] [Info] Start training from score 29.936027
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000552 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6069
[LightGBM] [Info] Number of data points in the train set: 898, number of used features: 62
[LightGBM] [Info] Start training from score 30.093541
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000550 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6085
[LightGBM] [Info] Number of data points in the train set: 905, number of used features: 62
[LightGBM] [Info] Start trainin



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000732 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6232
[LightGBM] [Info] Number of data points in the train set: 933, number of used features: 62
[LightGBM] [Info] Start training from score 30.810289




[WARN] Prophet backend error for LES MIRACULEUX. Skipping Prophet.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001057 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 11059
[LightGBM] [Info] Number of data points in the train set: 891, number of used features: 62
[LightGBM] [Info] Start training from score 6.098719




[WARN] Prophet backend error for LES MIRACULEUX. Skipping Prophet.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000632 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 11063
[LightGBM] [Info] Number of data points in the train set: 898, number of used features: 62
[LightGBM] [Info] Start training from score 6.106234




[WARN] Prophet backend error for LES MIRACULEUX. Skipping Prophet.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000661 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 11065
[LightGBM] [Info] Number of data points in the train set: 905, number of used features: 62
[LightGBM] [Info] Start training from score 6.113423




[WARN] Prophet backend error for LES MIRACULEUX. Skipping Prophet.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000648 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 11072
[LightGBM] [Info] Number of data points in the train set: 912, number of used features: 62
[LightGBM] [Info] Start training from score 6.119512




[WARN] Prophet backend error for LES MIRACULEUX. Skipping Prophet.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000513 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 11076
[LightGBM] [Info] Number of data points in the train set: 919, number of used features: 62
[LightGBM] [Info] Start training from score 6.124435




[WARN] Prophet backend error for LES MIRACULEUX. Skipping Prophet.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000577 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 11079
[LightGBM] [Info] Number of data points in the train set: 926, number of used features: 62
[LightGBM] [Info] Start training from score 6.130249




[WARN] Prophet backend error for LES MIRACULEUX. Skipping Prophet.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000656 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 11082
[LightGBM] [Info] Number of data points in the train set: 933, number of used features: 62
[LightGBM] [Info] Start training from score 6.135789




[WARN] Prophet backend error for MINCI DELICE. Skipping Prophet.




[WARN] Prophet backend error for MINCI DELICE. Skipping Prophet.




[WARN] Prophet backend error for MINCI DELICE. Skipping Prophet.




[WARN] Prophet backend error for MINCI DELICE. Skipping Prophet.




[WARN] Prophet backend error for MINCI DELICE. Skipping Prophet.




[WARN] Prophet backend error for MINCI DELICE. Skipping Prophet.




[WARN] Prophet backend error for MINCI DELICE. Skipping Prophet.




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000697 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 10837
[LightGBM] [Info] Number of data points in the train set: 891, number of used features: 62
[LightGBM] [Info] Start training from score 4.324214




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000698 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 10836
[LightGBM] [Info] Number of data points in the train set: 898, number of used features: 62
[LightGBM] [Info] Start training from score 4.325476




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000686 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 10828
[LightGBM] [Info] Number of data points in the train set: 905, number of used features: 62
[LightGBM] [Info] Start training from score 4.330298




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000659 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 10823
[LightGBM] [Info] Number of data points in the train set: 912, number of used features: 62
[LightGBM] [Info] Start training from score 4.332341




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000572 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 10824
[LightGBM] [Info] Number of data points in the train set: 919, number of used features: 62
[LightGBM] [Info] Start training from score 4.332903




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000662 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 10839
[LightGBM] [Info] Number of data points in the train set: 926, number of used features: 62
[LightGBM] [Info] Start training from score 4.338286




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000558 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 10850
[LightGBM] [Info] Number of data points in the train set: 933, number of used features: 62
[LightGBM] [Info] Start training from score 4.336585


Unnamed: 0,CUSTOMER,fold,anchor,model,MAE,RMSE,sMAPE,n
0,ARGALYS,1,2023-03-04,ARIMA,18.827211,22.215159,37.911303,25
1,ARGALYS,2,2023-03-11,ARIMA,19.234811,23.545821,35.489678,25
2,ARGALYS,3,2023-03-18,ARIMA,18.877841,23.473813,36.469244,25
3,ARGALYS,4,2023-03-25,ARIMA,16.619214,20.299354,35.414318,25
4,ARGALYS,5,2023-04-01,ARIMA,16.872136,20.778088,35.029072,25
5,ARGALYS,6,2023-04-08,ARIMA,18.413524,21.654446,43.767677,25
6,ARGALYS,7,2023-04-15,ARIMA,18.159794,21.473235,44.480728,25
7,ARGALYS,1,2023-03-04,ETS-Add,25.776801,29.571548,48.744228,25
8,ARGALYS,2,2023-03-11,ETS-Add,18.179473,23.497045,35.384216,25
9,ARGALYS,3,2023-03-18,ETS-Add,22.894059,25.504009,43.386257,25


Unnamed: 0,CUSTOMER,model,MAE,RMSE,sMAPE
0,ARGALYS,LGBM,17.358564,22.228307,35.272153
1,ARGALYS,ARIMA,18.143504,21.919988,38.366003
2,ARGALYS,ETS-Add,18.530042,21.997488,38.530154
3,LES MIRACULEUX,ProphetMul,303.821214,357.266826,26.62575
4,LES MIRACULEUX,SARIMA_111_111_12,413.168255,466.140219,37.689406
5,LES MIRACULEUX,LGBM,500.236167,558.689249,41.140441
6,MINCI DELICE,ETS-Mul,738.903307,905.746082,22.682864
7,MINCI DELICE,ProphetMul,728.992261,842.824579,23.740846
8,MINCI DELICE,SARIMA_weekly,787.800669,936.800388,25.593067
9,NUTRAVANCE,ETS-Add,63.565107,76.321157,50.851235
