In [4]:
# ============================================================================
# Multi-Horizon LightGBM
# ============================================================================

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path
import pickle
import warnings
from tqdm import tqdm
import lightgbm as lgb
from m5_wrmsse import wrmsse
import gc
warnings.filterwarnings('ignore')

plt.style.use('seaborn-v0_8-darkgrid')

# ============================================================================
# 1. SETUP
# ============================================================================

print("="*80)
print("MULTI-HORIZON LIGHTGBM")
print("="*80)

DATA_DIR = Path("../data/processed")
RAW_DIR = Path("../data/raw")
OUTPUT_DIR = Path("../data/multihorizon_results")
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# ============================================================================
# 2. CARICA DATI
# ============================================================================

print("\n[1/5] Caricamento dati...")

with open(DATA_DIR / "train_official.pkl", 'rb') as f:
    train = pickle.load(f)

print(f"✓ Train: {train.shape}")

# ============================================================================
# 3. FEATURE ENGINEERING
# ============================================================================

print("\n[2/5] Feature engineering...")

train = train.sort_values(['id', 'date']).reset_index(drop=True)

# Date features
train['dayofweek'] = train['date'].dt.dayofweek
train['day'] = train['date'].dt.day
train['month'] = train['date'].dt.month
train['year'] = train['date'].dt.year
train['is_weekend'] = (train['dayofweek'] >= 5).astype(int)

print("  Aggregations (2 levels)...")
# Level 1: Item (tutti gli store)
item_agg = train.groupby(['item_id', 'date'])['sales'].mean().reset_index()
item_agg.columns = ['item_id', 'date', 'sales_item']
train = train.merge(item_agg, on=['item_id', 'date'], how='left')

# Level 2: Dept-Store
dept_agg = train.groupby(['dept_id', 'store_id', 'date'])['sales'].mean().reset_index()
dept_agg.columns = ['dept_id', 'store_id', 'date', 'sales_dept']
train = train.merge(dept_agg, on=['dept_id', 'store_id', 'date'], how='left')

del item_agg, dept_agg
gc.collect()

# Lag features 
print("  Lag features...")
for lag in tqdm([7, 28], desc="    Lags"):
    train[f'lag_{lag}'] = train.groupby('id')['sales'].shift(lag)
    train[f'lag_item_{lag}'] = train.groupby('item_id')['sales_item'].shift(lag)

# Rolling features
print("  Rolling features...")
for window in tqdm([7, 28], desc="    Rolling"):
    train[f'roll_{window}'] = train.groupby('id')['sales'].transform(
        lambda x: x.shift(1).rolling(window).mean()
    )

# Encoding
train['id_encoded'] = pd.factorize(train['id'])[0]
train['dept_encoded'] = pd.factorize(train['dept_id'])[0]

print(f"✓ Features: {train.shape}")

# ============================================================================
# 4. TRAIN/TEST SPLIT
# ============================================================================

print("\n[3/5] Train/test split...")

# Usa solo 1 anno di training (più veloce, comunque efficace)
DAYS_HISTORY = 365
train['day_num'] = (train['date'] - train['date'].min()).dt.days
max_day = train['day_num'].max()

# Train
train_start = max_day - DAYS_HISTORY
train_end = max_day - 28

X_train = train[(train['day_num'] >= train_start) & 
                (train['day_num'] <= train_end)].copy()
X_test = train[train['day_num'] > train_end].copy()

print(f"Train: {X_train.shape}")
print(f"Test: {X_test.shape}")

# Features
feature_cols = [
    'dayofweek', 'day', 'month', 'is_weekend',
    'id_encoded', 'dept_encoded',
    'sales_item', 'sales_dept',
    'lag_7', 'lag_28',
    'lag_item_7', 'lag_item_28',
    'roll_7', 'roll_28'
]

print(f"Features: {len(feature_cols)}")

# Dropna
X_train = X_train.dropna(subset=feature_cols)
X_test = X_test.dropna(subset=feature_cols)

print(f"After dropna - Train: {X_train.shape}, Test: {X_test.shape}")

del train
gc.collect()

# ============================================================================
# 5. TRAIN 28 MODELS
# ============================================================================

print("\n[4/5] Training 28 models...")

params = {
    'objective': 'poisson',
    'metric': 'rmse',
    'learning_rate': 0.1,  
    'num_leaves': 31,
    'max_depth': 6,        
    'feature_fraction': 0.8,
    'verbose': -1,
    'n_jobs': -1
}

models = {}

for day in tqdm(range(1, 29), desc="Training"):
    # Shift features per questo forecast day
    X_train_day = X_train.copy()
    
    for col in feature_cols:
        if 'lag' in col or 'roll' in col or col in ['sales_item', 'sales_dept']:
            X_train_day[col] = X_train.groupby('id')[col].shift(day)
    
    # Dropna
    X_train_day = X_train_day.dropna(subset=feature_cols)
    
    # Train
    y_train_day = X_train_day['sales']
    X_train_features = X_train_day[feature_cols]
    
    lgb_train = lgb.Dataset(X_train_features, y_train_day)
    
    model = lgb.train(
        params,
        lgb_train,
        num_boost_round=100,  # Ridotto per velocità
        callbacks=[lgb.log_evaluation(period=0)]
    )
    
    models[f'F{day}'] = model  # ← CHANGED: day_X → FX
    
    del X_train_day, y_train_day, X_train_features, lgb_train
    gc.collect()

print(f"\n✓ {len(models)} models trained!")

# ============================================================================
# 6. FORECAST
# ============================================================================

print("\n[5/5] Forecasting...")

# Carica ordine serie
sales_orig = pd.read_csv(RAW_DIR / "sales_train_evaluation.csv")
series_order = sales_orig['id'].tolist()

# Qui inserisci:
train_days = 1913
test_days = 28
test_cols = [f'd_{i}' for i in range(train_days + 1, train_days + test_days + 1)]

test_actuals = sales_orig[['id'] + test_cols].copy()
test_actuals.columns = ['id'] + [f'F{i}' for i in range(1, test_days + 1)]

# Per ogni giorno, forecast
all_forecasts = []

for day in tqdm(range(1, 29), desc="Predicting"):
    # Test per questo giorno
    test_day = X_test[X_test['day_num'] == (max_day - 28 + day)].copy()
    
    if len(test_day) == 0:
        # Fallback: usa ultimo giorno disponibile
        test_day = X_test.copy()
    
    X_test_features = test_day[feature_cols].fillna(0)
    
    model = models[f'F{day}']  
    preds = model.predict(X_test_features)
    preds = np.maximum(preds, 0)
    
    # Map predictions to series
    test_day['forecast'] = preds
    day_forecasts = test_day.groupby('id')['forecast'].first()
    
    all_forecasts.append(day_forecasts)

# Combine in DataFrame (30490, 28)
forecast_df = pd.DataFrame(all_forecasts).T
forecast_df.columns = [f'F{i}' for i in range(1, 29)]  # ← CHANGED: day_X → FX

# Reindex
forecast_df = forecast_df.reindex(series_order).fillna(0)
forecast_array = forecast_df.values

print(f"✓ Forecast array: {forecast_array.shape}")
print(f"✓ Forecast columns: {forecast_df.columns.tolist()[:5]}...")  # Debug

# ============================================================================
# 7. WRMSSE
# ============================================================================

print("\nCalcolo WRMSSE...")

wrmsse_score = wrmsse(forecast_array)

print(f"\n✅ WRMSSE: {wrmsse_score:.4f}")

# ============================================================================
# 8. SALVATAGGIO
# ============================================================================

forecast_df_with_id = forecast_df.copy()
forecast_df_with_id['id'] = series_order
forecast_df_with_id = forecast_df_with_id[['id'] + [f'F{i}' for i in range(1, 29)]]

forecast_df_with_id.to_pickle(OUTPUT_DIR / 'multihorizon_forecasts.pkl')
print(f"✓ Salvato: multihorizon_forecasts.pkl con colonne FX")

summary = {
    'wrmsse': wrmsse_score,
    'n_models': 28,
    'approach': 'multi-horizon'
}

with open(OUTPUT_DIR / 'multihorizon_summary.pkl', 'wb') as f:
    pickle.dump(summary, f)

print("\n" + "="*80)
print("COMPLETATO ✅")

MULTI-HORIZON LIGHTGBM

[1/5] Caricamento dati...
✓ Train: (58327370, 11)

[2/5] Feature engineering...
  Aggregations (2 levels)...
  Lag features...


    Lags: 100%|███████████████████████████████████| 2/2 [00:05<00:00,  2.74s/it]


  Rolling features...


    Rolling: 100%|████████████████████████████████| 2/2 [00:09<00:00,  4.98s/it]


✓ Features: (58327370, 26)

[3/5] Train/test split...
Train: (10305620, 26)
Test: (853720, 26)
Features: 14
After dropna - Train: (10305620, 26), Test: (853720, 26)

[4/5] Training 28 models...


Training: 100%|█████████████████████████████████| 28/28 [04:45<00:00, 10.19s/it]



✓ 28 models trained!

[5/5] Forecasting...


Predicting: 100%|███████████████████████████████| 28/28 [00:00<00:00, 37.48it/s]


✓ Forecast array: (30490, 28)
✓ Forecast columns: ['F1', 'F2', 'F3', 'F4', 'F5']...

Calcolo WRMSSE...

✅ WRMSSE: 0.7273
✓ Salvato: multihorizon_forecasts.pkl con colonne FX

COMPLETATO ✅
