# 3. Modeling & Forecasting
This notebook fits baseline, Prophet, and XGBoost models, and compares their performance.

In [5]:
import numpy as np
import pandas as pd

# Sales history; the `date` column is parsed to datetimes at load time so the
# later cells can sort on it and use the `.dt` accessor.
sales_full = pd.read_csv(
    '../data/synthetic_sales.csv',
    parse_dates=['date'],
)

# SKU metadata table.
meta = pd.read_csv('../data/sku_metadata.csv')

In [6]:
# Baseline models
def mape(y_true, y_pred):
    """Mean absolute percentage error, expressed in percent.

    Positions where the actual value is exactly zero are excluded, because
    the percentage error is undefined there.

    Parameters
    ----------
    y_true : array-like
        Observed values.
    y_pred : array-like
        Forecast values, positionally aligned with ``y_true``.

    Returns
    -------
    float
        ``100 * mean(|y_true - y_pred| / |y_true|)`` over the non-zero
        actuals, or ``nan`` when there is no non-zero actual at all.
    """
    actual = np.array(y_true)
    forecast = np.array(y_pred)
    nonzero = actual != 0
    if not nonzero.any():
        # Nothing to divide by: the metric is undefined.
        return np.nan
    relative_error = (actual[nonzero] - forecast[nonzero]) / actual[nonzero]
    return np.mean(np.abs(relative_error)) * 100
def rmse(y_true, y_pred):
    """Root mean squared error between observed and forecast values.

    Parameters
    ----------
    y_true : array-like
        Observed values.
    y_pred : array-like
        Forecast values, positionally aligned with ``y_true``.

    Returns
    -------
    float
        Square root of the mean of the squared residuals.
    """
    residuals = np.array(y_true) - np.array(y_pred)
    mean_squared = np.mean(residuals ** 2)
    return np.sqrt(mean_squared)
def seasonal_naive_forecast(series, horizon=30, season_length=7):
    """Seasonal-naive forecast: repeat the last observed season.

    Parameters
    ----------
    series : pandas.Series
        Historical observations in chronological order.
    horizon : int, default 30
        Number of future steps to produce.
    season_length : int, default 7
        Number of trailing observations that make up one season. The
        default of 7 gives a weekly pattern on daily data.

    Returns
    -------
    numpy.ndarray
        Array of length ``horizon`` (empty when ``horizon <= 0``). If the
        history holds fewer than ``season_length`` observations, the whole
        history is cycled instead. If the history is empty, the result is
        filled with ``nan``, matching what ``moving_average_forecast``
        yields for an empty history.

    Raises
    ------
    ValueError
        If ``season_length`` is smaller than 1.
    """
    if season_length < 1:
        raise ValueError('season_length must be a positive integer')

    last = series.tail(season_length).values
    if horizon <= 0:
        return last[:0]
    if len(last) == 0:
        # No history to repeat; return an explicit "unknown" forecast of the
        # requested length rather than a silently shorter array.
        return np.full(horizon, np.nan)

    # Size the repetition count from the pattern actually available, so a
    # short history still fills the whole horizon.
    reps = int(np.ceil(horizon / len(last)))
    return np.tile(last, reps)[:horizon]
def moving_average_forecast(series, window=7, horizon=30):
    """Flat forecast equal to the mean of the most recent observations.

    Parameters
    ----------
    series : pandas.Series
        Historical observations in chronological order.
    window : int, default 7
        Number of trailing observations to average.
    horizon : int, default 30
        Number of future steps to produce.

    Returns
    -------
    numpy.ndarray
        Array of length ``horizon`` in which every entry is the trailing
        ``window`` mean.
    """
    recent = series.tail(window)
    level = recent.mean()
    return np.repeat(level, horizon)

In [7]:
# Prophet model for all SKUs
#
# For every SKU: hold out the last HORIZON rows, fit Prophet on the rest,
# score the hold-out with MAPE / RMSE, and persist the fitted model.
import os

import joblib

HORIZON = 30  # number of trailing rows per SKU held out for evaluation
PROPHET_MODEL_DIR = '../models/prophet_models'

try:
    from prophet import Prophet
    prophet_available = True
except Exception as e:
    # Deliberately broad: any import-time failure is treated as
    # "Prophet unavailable" so the rest of the notebook can still run.
    print('Prophet not available:', e)
    prophet_available = False

results = []
if prophet_available:
    # Loop-invariant: create the output directory once, not once per SKU.
    os.makedirs(PROPHET_MODEL_DIR, exist_ok=True)
    skus = list(sales_full['sku_id'].unique())
    for sku in skus:
        # Sort chronologically before the positional split below; a stable
        # sort keeps already-ordered data exactly as it was.
        df = (
            sales_full.loc[sales_full['sku_id'] == sku, ['date', 'units_sold']]
            .sort_values('date', kind='stable')
            .rename(columns={'date': 'ds', 'units_sold': 'y'})
        )
        # Prophet refuses to fit on fewer than 2 rows, so the training part
        # must keep at least 2 rows after the hold-out is removed.
        if len(df) < HORIZON + 2:
            print(f'Skipping {sku}: only {len(df)} rows, need at least {HORIZON + 2}')
            continue
        train = df.iloc[:-HORIZON]
        test = df.iloc[-HORIZON:]
        m = Prophet(weekly_seasonality=True, daily_seasonality=False)
        m.fit(train)
        # Predict on the dates that actually occur in the data (history +
        # hold-out) instead of a synthetic daily calendar, so the last
        # HORIZON predictions line up with `test` even if dates are missing.
        future = df[['ds']]
        fc = m.predict(future)
        y_pred = fc['yhat'].iloc[-HORIZON:].values
        y_true = test['y'].values
        results.append({
            'sku': sku,
            'mape': float(np.round(mape(y_true, y_pred), 3)),
            'rmse': float(np.round(rmse(y_true, y_pred), 3)),
        })
        # NOTE(review): the Prophet documentation advises against pickling
        # fitted models and recommends prophet.serialize.model_to_json.
        # joblib is kept here so existing consumers of the .joblib files
        # keep working - confirm before switching formats.
        joblib.dump(m, f'{PROPHET_MODEL_DIR}/prophet_{sku}.joblib')
    display(pd.DataFrame(results))
else:
    print('Prophet unavailable - skip this step. Use moving-average baseline instead.')

02:12:33 - cmdstanpy - INFO - Chain [1] start processing
02:12:33 - cmdstanpy - INFO - Chain [1] done processing
02:12:34 - cmdstanpy - INFO - Chain [1] start processing
02:12:34 - cmdstanpy - INFO - Chain [1] done processing
02:12:34 - cmdstanpy - INFO - Chain [1] start processing
02:12:34 - cmdstanpy - INFO - Chain [1] done processing
02:12:34 - cmdstanpy - INFO - Chain [1] start processing
02:12:34 - cmdstanpy - INFO - Chain [1] done processing
02:12:35 - cmdstanpy - INFO - Chain [1] start processing
02:12:35 - cmdstanpy - INFO - Chain [1] done processing
02:12:35 - cmdstanpy - INFO - Chain [1] start processing
02:12:35 - cmdstanpy - INFO - Chain [1] done processing
02:12:35 - cmdstanpy - INFO - Chain [1] start processing
02:12:35 - cmdstanpy - INFO - Chain [1] done processing
02:12:36 - cmdstanpy - INFO - Chain [1] start processing
02:12:36 - cmdstanpy - INFO - Chain [1] done processing
02:12:36 - cmdstanpy - INFO - Chain [1] start processing
02:12:36 - cmdstanpy - INFO - Chain [1]

Unnamed: 0,sku,mape,rmse
0,SKU_01,11.291,46.008
1,SKU_02,87.462,20.736
2,SKU_03,19.256,27.397
3,SKU_04,17.395,12.594
4,SKU_05,13.735,33.258
5,SKU_06,16.394,13.108
6,SKU_07,54.855,15.224
7,SKU_08,12.519,67.787
8,SKU_09,15.522,35.82
9,SKU_10,26.224,35.637


In [8]:
# XGBoost for all SKUs
#
# For every SKU: build lag / rolling / calendar features, estimate error with
# an expanding-window time-series CV, then refit on all rows and persist.
import os

import joblib
import xgboost as xgb
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import TimeSeriesSplit

MODEL_DIR = '../models'
N_SPLITS = 3
LAGS = (1, 7, 14, 28)

# NOTE(review): on_promo, price and views are same-day values taken straight
# from the input data; confirm they are known at forecast time, otherwise the
# CV error below is optimistic.
features = ['lag_1','lag_7','lag_14','lag_28','rolling_mean_7','rolling_std_14','day_of_week','is_weekend','on_promo','price','views']
target = 'units_sold'

# Loop-invariant: create the output directory once.
os.makedirs(MODEL_DIR, exist_ok=True)

skus = list(sales_full['sku_id'].unique())
xgb_results = []
for sku in skus:
    df = sales_full[sales_full['sku_id']==sku].sort_values('date').copy()

    # Lagged demand; shift(k) makes row t see the value from row t-k.
    for lag in LAGS:
        df[f'lag_{lag}'] = df['units_sold'].shift(lag)
    # Rolling statistics are shifted by one row so that row t only uses
    # values up to t-1 (no look-ahead on the target).
    df['rolling_mean_7'] = df['units_sold'].rolling(7).mean().shift(1)
    df['rolling_std_14'] = df['units_sold'].rolling(14).std().shift(1)
    df['day_of_week'] = df['date'].dt.weekday
    df['is_weekend'] = df['day_of_week'].isin([5,6]).astype(int)

    # Only drop rows that are incomplete in columns the model actually uses;
    # NaNs in unrelated columns must not shrink the training set.
    df = df.dropna(subset=features + [target]).reset_index(drop=True)

    # TimeSeriesSplit needs more samples than splits.
    if len(df) <= N_SPLITS:
        print(f'Skipping {sku}: only {len(df)} usable rows after feature engineering')
        continue

    X = df[features]
    y = df[target]

    tscv = TimeSeriesSplit(n_splits=N_SPLITS)
    maes = []
    for fold, (train_idx, test_idx) in enumerate(tscv.split(X), start=1):
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
        model = xgb.XGBRegressor(n_estimators=200, max_depth=4, verbosity=0, random_state=42)
        model.fit(X_train, y_train)
        preds = model.predict(X_test)
        mae = mean_absolute_error(y_test, preds)
        maes.append(mae)
        print(f'SKU {sku} Fold {fold} MAE: {mae:.3f}')

    # NOTE(review): the persisted model uses 300 trees while the CV models
    # use 200, so the CV MAE is not measured on exactly this configuration.
    final_model = xgb.XGBRegressor(n_estimators=300, max_depth=4, verbosity=0, random_state=42)
    final_model.fit(X, y)
    joblib.dump(final_model, f'{MODEL_DIR}/xgb_{sku}.joblib')
    print(f'Trained XGBoost for SKU: {sku}')
    print('Mean CV MAE:', np.mean(maes))
    xgb_results.append({'sku': sku, 'mean_cv_mae': float(np.mean(maes))})

display(pd.DataFrame(xgb_results))

SKU SKU_01 Fold 1 MAE: 30.389
SKU SKU_01 Fold 2 MAE: 26.962
SKU SKU_01 Fold 3 MAE: 30.746
Trained XGBoost for SKU: SKU_01
Mean CV MAE: 29.365767161051433
SKU SKU_02 Fold 1 MAE: 0.645
SKU SKU_02 Fold 2 MAE: 0.842
SKU SKU_02 Fold 3 MAE: 0.403
Trained XGBoost for SKU: SKU_02
Mean CV MAE: 0.6301538546880087
SKU SKU_03 Fold 1 MAE: 7.628
SKU SKU_03 Fold 2 MAE: 9.460
SKU SKU_03 Fold 3 MAE: 9.293
Trained XGBoost for SKU: SKU_03
Mean CV MAE: 8.793705304463705
SKU SKU_04 Fold 1 MAE: 6.928
SKU SKU_04 Fold 2 MAE: 6.383
SKU SKU_04 Fold 3 MAE: 6.406
Trained XGBoost for SKU: SKU_04
Mean CV MAE: 6.572510560353597
SKU SKU_05 Fold 1 MAE: 17.686
SKU SKU_05 Fold 2 MAE: 17.140
SKU SKU_05 Fold 3 MAE: 14.548
Trained XGBoost for SKU: SKU_05
Mean CV MAE: 16.458025296529133
SKU SKU_06 Fold 1 MAE: 7.642
SKU SKU_06 Fold 2 MAE: 8.033
SKU SKU_06 Fold 3 MAE: 9.547
Trained XGBoost for SKU: SKU_06
Mean CV MAE: 8.407290935516357
SKU SKU_07 Fold 1 MAE: 0.516
SKU SKU_07 Fold 2 MAE: 0.741
SKU SKU_07 Fold 3 MAE: 0.729
Trai

Unnamed: 0,sku,mean_cv_mae
0,SKU_01,29.365767
1,SKU_02,0.630154
2,SKU_03,8.793705
3,SKU_04,6.572511
4,SKU_05,16.458025
5,SKU_06,8.407291
6,SKU_07,0.661972
7,SKU_08,26.05748
8,SKU_09,18.812641
9,SKU_10,11.725182
