In [25]:
# Enable IPython's autoreload extension in mode 2 so that edits to the
# project modules imported below (src.*) are picked up without restarting
# the kernel.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [33]:
import pandas as pd
from src.features.future_features import build_future_features
from src.utils.timeseries_split import (
    compute_min_hist, rolling_time_series_cv, select_by_index
)
from src.features.build_features import build_features
from src.pipeline.per_customer import per_customer_cv
import numpy as np
from src.data.preprocess import preprocess_all_customers
from src.data.loader import load_raw, reindex_daily

In [34]:
# Load the raw training set via the project loader (relative path; note the
# space in the file name).
df = load_raw("data/raw/train set.csv") 

In [35]:
# Clean all customers' series in one call. The helper returns the cleaned
# frame plus a summary object that is indexed below by customer columns.
# NOTE(review): causal=False presumably lets the cleaning step look at later
# observations — confirm that is acceptable ahead of time-series CV.
df_clean, summary = preprocess_all_customers(
    df,
    long_gap_days=30, gap_limit=7,
    min_nonzero_run=5, min_nonzero_value=1.0,
    causal=False, verbose=True,
)

# Quick sanity report: overall shape, then the trimming result per customer.
print("Cleaned dataset shape:", df_clean.shape)
print(
    summary[
        ["CUSTOMER", "inactive_lead_days", "clean_start", "clean_end"]
    ]
)

[clean_and_truncate_series] {'active': True, 'orig_len': 1402, 'clean_len': 1041, 'active_days': 1041, 'orig_start': Timestamp('2019-08-01 00:00:00'), 'orig_end': Timestamp('2023-06-02 00:00:00'), 'clean_start': Timestamp('2020-07-27 00:00:00'), 'clean_end': Timestamp('2023-06-02 00:00:00'), 'inactive_lead_days': 361, 'inactive_tail_days': 11, 'gap_limit': 7, 'causal': False, 'min_nonzero_run': 5, 'min_nonzero_value': 1.0}
[clean_and_truncate_series] {'active': True, 'orig_len': 1485, 'clean_len': 1272, 'active_days': 1272, 'orig_start': Timestamp('2019-05-10 00:00:00'), 'orig_end': Timestamp('2023-06-02 00:00:00'), 'clean_start': Timestamp('2019-12-09 00:00:00'), 'clean_end': Timestamp('2023-06-02 00:00:00'), 'inactive_lead_days': 213, 'inactive_tail_days': 11, 'gap_limit': 7, 'causal': False, 'min_nonzero_run': 5, 'min_nonzero_value': 1.0}
[clean_and_truncate_series] {'active': True, 'orig_len': 2072, 'clean_len': 2072, 'active_days': 2072, 'orig_start': Timestamp('2017-09-30 00:00:0

In [36]:
# --- Cross-validation settings (forwarded to rolling_time_series_cv) ---
HORIZON_DAYS = 25            # horizon_days
STEP_DAYS = 7                # step_days
N_FOLDS = 5                  # n_folds
WINDOW_TYPE = "expanding"    # window_type

# --- Feature settings (forwarded to build_features via build_feat) ---
MAX_LAG = 30
ROLL_WINDOWS = [7, 14, 30]
# Derived from the lag / rolling-window settings; handed to the splitter as min_hist.
MIN_HIST = compute_min_hist(MAX_LAG, ROLL_WINDOWS)

# --- Holiday settings (forwarded to build_features via build_feat) ---
HOLIDAY_COUNTRY = "FR"
HOLIDAY_SUBDIV_MAP = None
HOLIDAY_WINDOW = 3

In [37]:
def build_feat(df_slice: pd.DataFrame) -> pd.DataFrame:
    """Build features for ``df_slice`` using the notebook-level configuration.

    Thin wrapper around ``build_features`` that pins the lag, rolling-window
    and holiday settings to the module constants so every fold is featurised
    the same way.

    Parameters
    ----------
    df_slice : pd.DataFrame
        The rows to featurise (a train or validation slice in this notebook).

    Returns
    -------
    pd.DataFrame
        Whatever ``build_features`` returns for these settings.
    """
    feature_kwargs = dict(
        max_lag=MAX_LAG,
        roll_windows=ROLL_WINDOWS,
        holiday_country=HOLIDAY_COUNTRY,
        holiday_subdiv_map=HOLIDAY_SUBDIV_MAP,
        holiday_window=HOLIDAY_WINDOW,
        # Per the original author's note this keeps lag_1 available on the
        # validation slice — behaviour lives in build_features, not shown here.
        trim_by_history=True,
        dropna_mode="none",
    )
    return build_features(df_slice, **feature_kwargs)

def naive_last_value_baseline(X_frame: pd.DataFrame) -> np.ndarray:
    """Predict each row's target as its ``lag_1`` feature.

    Missing lag values are replaced with 0 before conversion to a NumPy array.

    Raises
    ------
    KeyError
        If ``X_frame`` has no ``lag_1`` column.
    """
    if "lag_1" in X_frame.columns:
        previous_value = X_frame["lag_1"]
        return previous_value.fillna(0).to_numpy()
    raise KeyError("lag_1 not found in features — check trim_by_history / max_lag.")

def smape(y_true, y_pred, eps=1e-8):
    """Symmetric mean absolute percentage error, in percent (0–200).

    Inputs are coerced to float NumPy arrays and compared positionally, so
    lists and pandas Series are handled the same way as NumPy arrays.

    Parameters
    ----------
    y_true, y_pred : array-like
        Actual and predicted values of matching shape.
    eps : float, default 1e-8
        Floor applied to the denominator ``|y_true| + |y_pred|`` so that
        points where both values are zero contribute 0 instead of dividing
        by zero.

    Returns
    -------
    float
        ``100 * mean(2 * |y_pred - y_true| / max(|y_true| + |y_pred|, eps))``;
        NaN when the inputs are empty.
    """
    actual = np.asarray(y_true, dtype=float)
    forecast = np.asarray(y_pred, dtype=float)
    if actual.size == 0:
        # Nothing to average over; return NaN explicitly rather than relying
        # on np.mean's empty-slice warning.
        return float("nan")
    num = np.abs(forecast - actual)
    den = np.maximum(np.abs(actual) + np.abs(forecast), eps)
    return float(100.0 * np.mean(2.0 * num / den))

def mae(y_true, y_pred):
    """Mean absolute error between ``y_true`` and ``y_pred``.

    Inputs are coerced to float NumPy arrays first, so the comparison is
    positional: lists are accepted and pandas Series are not re-aligned on
    their index labels. Returns a Python ``float`` (NaN for empty input).
    """
    actual = np.asarray(y_true, dtype=float)
    forecast = np.asarray(y_pred, dtype=float)
    if actual.size == 0:
        return float("nan")
    return float(np.mean(np.abs(actual - forecast)))
def rmse(y_true, y_pred):
    """Root mean squared error between ``y_true`` and ``y_pred``.

    Inputs are coerced to float NumPy arrays first, so the comparison is
    positional: lists are accepted and pandas Series are not re-aligned on
    their index labels. Returns a Python ``float`` (NaN for empty input).
    """
    actual = np.asarray(y_true, dtype=float)
    forecast = np.asarray(y_pred, dtype=float)
    if actual.size == 0:
        return float("nan")
    return float(np.sqrt(np.mean((actual - forecast) ** 2)))

In [38]:
# Column layout of the per-fold results table. Passing it to the DataFrame
# constructor keeps the sort below working even when no rows were collected
# (an empty frame without columns would raise KeyError in sort_values).
RESULT_COLUMNS = ["CUSTOMER", "fold", "anchor", "n", "MAE", "RMSE", "sMAPE"]

# Non-feature columns removed from the validation frame before prediction.
drop_cols = ["DATE", "CUSTOMER", "QUANTITY"]

all_rows = []

for cust in df_clean["CUSTOMER"].dropna().unique():
    # Work on one customer's chronologically ordered rows with a fresh
    # 0..n-1 index.
    df_c = df_clean[df_clean["CUSTOMER"] == cust].sort_values("DATE").reset_index(drop=True)
    if df_c.empty:
        continue

    # Build folds on THIS customer's timeline
    folds = rolling_time_series_cv(
        df_c,
        n_folds=N_FOLDS,
        window_type=WINDOW_TYPE,      # or "sliding" with train_window_days=365
        train_window_days=365,
        step_days=STEP_DAYS,
        horizon_days=HORIZON_DAYS,
        gap_days=0,
        by_customer=True,             # still True (single-customer df)
        min_hist=MIN_HIST             # original note: make sure the split util is patched — TODO confirm
    )

    if not folds:
        # Keep a placeholder row so customers without any usable fold still
        # show up in the per-fold table.
        all_rows.append({"CUSTOMER": cust, "fold": None, "anchor": None,
                         "n": 0, "MAE": np.nan, "RMSE": np.nan, "sMAPE": np.nan})
        continue

    for f in folds:
        # select by indices within the single-customer df
        train_df = select_by_index(df_c, f.train_idx)
        val_df   = select_by_index(df_c, f.val_idx)

        # NOTE(review): Xy_tr is not consumed by the naive baseline in this
        # cell; kept in case later cells rely on it — confirm and drop if not.
        Xy_tr = build_feat(train_df)
        # NOTE(review): features are built from val_df alone, so lag_1
        # presumably comes from the previous row *inside* the validation
        # window (and is missing on its first row, hence fillna(0) in the
        # baseline). Confirm this one-step-ahead setup is intended for a
        # HORIZON_DAYS-long forecast.
        Xy_va = build_feat(val_df)
        if Xy_va.empty:
            continue

        X_val = Xy_va.drop(columns=drop_cols)
        y_val = Xy_va["QUANTITY"].to_numpy()

        y_hat = naive_last_value_baseline(X_val)

        all_rows.append({
            "CUSTOMER": cust,
            "fold": f.fold,
            "anchor": f.meta["anchor"].date(),
            "n": len(y_val),
            "MAE":  mae(y_val, y_hat),
            "RMSE": rmse(y_val, y_hat),
            "sMAPE": smape(y_val, y_hat),
        })

per_fold_per_customer = (
    pd.DataFrame(all_rows, columns=RESULT_COLUMNS)
    .sort_values(["CUSTOMER", "fold"])
    .reset_index(drop=True)
)
display(per_fold_per_customer)

Unnamed: 0,CUSTOMER,fold,anchor,n,MAE,RMSE,sMAPE
0,ARGALYS,1,2020-10-25,25,6.48,7.793159,64.499231
1,ARGALYS,2,2020-11-01,25,6.96,8.557648,54.432514
2,ARGALYS,3,2020-11-08,25,8.04,9.590794,49.735365
3,ARGALYS,4,2020-11-15,25,9.98,12.075595,55.306101
4,ARGALYS,5,2020-11-22,25,12.24,14.502529,65.031821
5,LES MIRACULEUX,1,2020-03-08,25,48.2,65.777352,58.370227
6,LES MIRACULEUX,2,2020-03-15,25,43.52,56.904833,47.904812
7,LES MIRACULEUX,3,2020-03-22,25,61.36,83.534185,46.068433
8,LES MIRACULEUX,4,2020-03-29,25,67.24,92.151035,45.118265
9,LES MIRACULEUX,5,2020-04-05,25,67.04,86.71432,45.929559


In [39]:
# Average each metric over a customer's folds. Placeholder rows (fold is
# missing) are excluded first so they do not take part in the aggregation.
per_customer_mean = (
    per_fold_per_customer
    .loc[lambda frame: frame["fold"].notna()]
    .groupby("CUSTOMER", as_index=False)[["MAE", "RMSE", "sMAPE"]]
    .mean()
    .sort_values("CUSTOMER", ignore_index=True)
)
display(per_customer_mean)

Unnamed: 0,CUSTOMER,MAE,RMSE,sMAPE
0,ARGALYS,8.74,10.503945,57.801006
1,LES MIRACULEUX,57.472,77.016345,48.678259
2,MINCI DELICE,692.016,872.844329,36.36013
3,NUTRAVANCE,29.232,47.03903,40.893553
