In [2]:
# ============================================================================
# LightGBM 
# ============================================================================

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import pickle
import warnings
from tqdm import tqdm
import lightgbm as lgb
from m5_wrmsse import wrmsse
import gc
warnings.filterwarnings('ignore')

plt.style.use('seaborn-v0_8-darkgrid')

# ============================================================================
# 1. SETUP
# ============================================================================

print("="*80)
print("LIGHTGBM")
print("="*80)

DATA_DIR = Path("../data/processed")
RAW_DIR = Path("../data/raw")
OUTPUT_DIR = Path("../data/lightgbm_results")
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

print(f"LightGBM: {lgb.__version__}")

# ============================================================================
# 2. CARICA DATI
# ============================================================================

print("\n[1/6] Caricamento dati...")

with open(DATA_DIR / "train_official.pkl", 'rb') as f:
    train = pickle.load(f)

print(f"✓ Train: {train.shape}")

# ============================================================================
# 3. FEATURE ENGINEERING
# ============================================================================

print("\n[2/6] Feature engineering")

# Sort
train = train.sort_values(['id', 'date']).reset_index(drop=True)

# Date features
print("  Date features...")
train['dayofweek'] = train['date'].dt.dayofweek
train['day'] = train['date'].dt.day
train['month'] = train['date'].dt.month
train['year'] = train['date'].dt.year
train['is_weekend'] = (train['dayofweek'] >= 5).astype(int)

# Lag features 
print("  Lag features...")
for lag in tqdm([28, 35, 42], desc="    Lags"):
    train[f'lag_{lag}'] = train.groupby('id')['sales'].shift(lag)

# Rolling features
print("  Rolling features...")
for window in tqdm([7, 28], desc="    Rolling"):
    train[f'roll_mean_{window}'] = train.groupby('id')['sales'].transform(
        lambda x: x.shift(28).rolling(window).mean()
    )

# ID encoding
print("  ID encoding...")
train['id_encoded'] = pd.factorize(train['id'])[0]

print(f"✓ Features create: {train.shape}")

# ============================================================================
# 4. TRAIN/TEST SPLIT
# ============================================================================

print("\n[3/6] Preparazione train/test...")

# Ultimi 28 giorni per test
train['day_num'] = (train['date'] - train['date'].min()).dt.days

TRAIN_END = train['day_num'].max() - 28

X_train = train[train['day_num'] <= TRAIN_END].copy()
X_test = train[train['day_num'] > TRAIN_END].copy()

print(f"Train: {X_train.shape} (fino giorno {TRAIN_END})")
print(f"Test:  {X_test.shape} (ultimi 28 giorni)")

# Features
feature_cols = [
    'dayofweek', 'day', 'month', 'year', 'is_weekend', 'id_encoded',
    'lag_28', 'lag_35', 'lag_42',
    'roll_mean_7', 'roll_mean_28'
]

print(f"\nFeatures: {len(feature_cols)}")

# Rimuovi NaN
X_train = X_train.dropna(subset=feature_cols)
X_test = X_test.dropna(subset=feature_cols)

print(f"Dopo dropna - Train: {X_train.shape}, Test: {X_test.shape}")

# Separa
y_train = X_train['sales']
X_train_features = X_train[feature_cols]

y_test = X_test['sales']
X_test_features = X_test[feature_cols]

print(f"\n✅ X_train: {X_train_features.shape}")
print(f"✅ X_test: {X_test_features.shape}")

# ============================================================================
# 5. LIGHTGBM TRAINING
# ============================================================================

print("\n[4/6] Training LightGBM...")

params = {
    'objective': 'poisson',
    'metric': 'rmse',
    'learning_rate': 0.05,
    'num_leaves': 31,
    'max_depth': 8,
    'min_data_in_leaf': 50,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.7,
    'bagging_freq': 5,
    'lambda_l1': 0.1,
    'lambda_l2': 0.1,
    'verbose': -1,
    'n_jobs': -1
}

lgb_train = lgb.Dataset(X_train_features, y_train)
lgb_test = lgb.Dataset(X_test_features, y_test, reference=lgb_train)

print("\n🚀 Training...")

model = lgb.train(
    params,
    lgb_train,
    num_boost_round=500,
    valid_sets=[lgb_train, lgb_test],
    valid_names=['train', 'test'],
    callbacks=[
        lgb.early_stopping(stopping_rounds=30),
        lgb.log_evaluation(period=50)
    ]
)

print(f"\n✅ Training completato!")
print(f"  Best iteration: {model.best_iteration}")
print(f"  Best RMSE: {model.best_score['test']['rmse']:.2f}")

# Feature importance
importance = pd.DataFrame({
    'feature': feature_cols,
    'importance': model.feature_importance()
}).sort_values('importance', ascending=False)

print("\nTop 10 features:")
print(importance.head(10).to_string(index=False))

# ============================================================================
# 6. FORECAST SU TUTTE LE SERIE
# ============================================================================

print("\n[5/6] Generazione forecasts per submission...")

# Carica ordine originale
sales_orig = pd.read_csv(RAW_DIR / "sales_train_evaluation.csv")
series_order = sales_orig['id'].tolist()

print(f"Serie totali: {len(series_order)}")

# Carica eval data
with open(DATA_DIR / "eval_official.pkl", 'rb') as f:
    eval_data = pickle.load(f)

# Prepara eval data con stesse features
print("  Preparazione eval data...")
eval_data = eval_data.sort_values(['id', 'date']).reset_index(drop=True)
eval_data['dayofweek'] = eval_data['date'].dt.dayofweek
eval_data['day'] = eval_data['date'].dt.day
eval_data['month'] = eval_data['date'].dt.month
eval_data['year'] = eval_data['date'].dt.year
eval_data['is_weekend'] = (eval_data['dayofweek'] >= 5).astype(int)
eval_data['id_encoded'] = pd.factorize(eval_data['id'])[0]

# Crea dizionario per ultimi valori training per serie 
print("  Preparazione ultimi valori per lag...")
last_values_dict = (
    X_train.groupby('id')['sales']
    .apply(lambda x: x.tail(60).values)
    .to_dict()
)
print(f"  ✓ Dizionario creato: {len(last_values_dict)} serie")

del train, X_test, y_test, X_test_features
gc.collect()

# Genera forecasts
all_forecasts = []

for series_id in tqdm(series_order, desc="Forecasting"):
    eval_series = eval_data[eval_data['id'] == series_id].copy()
    
    if len(eval_series) == 0 or series_id not in last_values_dict:
        # Serie non vista: forecast = 0
        all_forecasts.append(np.zeros(28))
        continue
    
    # Prendi ultimi valori per lag
    last_vals = last_values_dict[series_id]
    
    # Calcola lag features per eval
    eval_series['lag_28'] = last_vals[-28] if len(last_vals) >= 28 else 0
    eval_series['lag_35'] = last_vals[-35] if len(last_vals) >= 35 else 0
    eval_series['lag_42'] = last_vals[-42] if len(last_vals) >= 42 else 0
    
    # Rolling features
    eval_series['roll_mean_7'] = np.mean(last_vals[-35:-28]) if len(last_vals) >= 35 else 0
    eval_series['roll_mean_28'] = np.mean(last_vals[-56:-28]) if len(last_vals) >= 56 else 0
    
    # Predict
    X_eval = eval_series[feature_cols].fillna(0)
    preds = model.predict(X_eval, num_iteration=model.best_iteration)
    preds = np.maximum(preds, 0)[:28]  # Solo primi 28 e non-negative
    
    # Pad se necessario
    if len(preds) < 28:
        preds = np.pad(preds, (0, 28 - len(preds)), constant_values=0)
    
    all_forecasts.append(preds)

# Converti in array (30490, 28)
forecast_array = np.array(all_forecasts)
print(f"\n✅ Forecast array: {forecast_array.shape}")

# Check NaN
nan_count = np.isnan(forecast_array).sum()
if nan_count > 0:
    print(f"⚠️  {nan_count} NaN trovati, riempimento con 0...")
    forecast_array = np.nan_to_num(forecast_array, nan=0.0)

# ============================================================================
# 7. CALCOLO WRMSSE
# ============================================================================

print("\n[6/6] Calcolo WRMSSE...")

wrmsse_score = wrmsse(forecast_array)

print(f"\n✅ WRMSSE: {wrmsse_score:.4f}")

# ============================================================================
# 8. SALVATAGGIO
# ============================================================================

print("\nSalvataggio risultati...")

model.save_model(str(OUTPUT_DIR / 'lightgbm_model.txt'))

forecast_df = pd.DataFrame(forecast_array, index=series_order)
forecast_df.to_pickle(OUTPUT_DIR / 'lightgbm_forecasts.pkl')

importance.to_csv(OUTPUT_DIR / 'feature_importance.csv', index=False)

summary = {
    'wrmsse': wrmsse_score,
    'best_iteration': model.best_iteration,
    'best_rmse': model.best_score['test']['rmse'],
    'n_series': len(series_order),
    'n_features': len(feature_cols),
}

with open(OUTPUT_DIR / 'lightgbm_summary.pkl', 'wb') as f:
    pickle.dump(summary, f)

LIGHTGBM
LightGBM: 4.6.0

[1/6] Caricamento dati...
✓ Train: (58327370, 11)

[2/6] Feature engineering
  Date features...
  Lag features...


    Lags: 100%|███████████████████████████████████| 3/3 [00:04<00:00,  1.61s/it]


  Rolling features...


    Rolling: 100%|████████████████████████████████| 2/2 [00:10<00:00,  5.10s/it]


  ID encoding...
✓ Features create: (58327370, 22)

[3/6] Preparazione train/test...
Train: (57473650, 22) (fino giorno 1884)
Test:  (853720, 22) (ultimi 28 giorni)

Features: 11
Dopo dropna - Train: (55796700, 22), Test: (853720, 22)

✅ X_train: (55796700, 11)
✅ X_test: (853720, 11)

[4/6] Training LightGBM...

🚀 Training...
Training until validation scores don't improve for 30 rounds
[50]	train's rmse: 2.65119	test's rmse: 2.36561
[100]	train's rmse: 2.53552	test's rmse: 2.23402
[150]	train's rmse: 2.51948	test's rmse: 2.21601
[200]	train's rmse: 2.50896	test's rmse: 2.21005
[250]	train's rmse: 2.49854	test's rmse: 2.20507
[300]	train's rmse: 2.48659	test's rmse: 2.20026
[350]	train's rmse: 2.47145	test's rmse: 2.1944
[400]	train's rmse: 2.46403	test's rmse: 2.19145
[450]	train's rmse: 2.45328	test's rmse: 2.18792
[500]	train's rmse: 2.44469	test's rmse: 2.18531
Did not meet early stopping. Best iteration is:
[500]	train's rmse: 2.44469	test's rmse: 2.18531

✅ Training completato!
  

Forecasting: 100%|████████████████████████| 30490/30490 [08:15<00:00, 61.56it/s]



✅ Forecast array: (30490, 28)

[6/6] Calcolo WRMSSE...

✅ WRMSSE: 0.8145

Salvataggio risultati...
