# 3. Modeling & Forecasting
This notebook fits baseline, Prophet, and XGBoost models, and compares their performance.

In [1]:
import pandas as pd, numpy as np
# Sales table: the `date` column is parsed into datetimes while reading.
sales_full = pd.read_csv(
    '../data/synthetic_sales.csv',
    parse_dates=['date'],
)
# SKU metadata table, read with pandas' default settings.
meta = pd.read_csv('../data/sku_metadata.csv')

In [2]:
# Baseline models
def mape(y_true, y_pred):
    """Mean absolute percentage error, expressed in percent.

    Positions where the actual value is exactly zero are excluded, because
    the percentage error is undefined there.

    Parameters
    ----------
    y_true : array-like
        Observed values.
    y_pred : array-like
        Predicted values, aligned position-by-position with ``y_true``.

    Returns
    -------
    float
        ``100 * mean(|y_true - y_pred| / y_true)`` over the non-zero actuals,
        or ``np.nan`` when there is no non-zero actual to score against.
    """
    actual = np.array(y_true)
    predicted = np.array(y_pred)

    nonzero = actual != 0
    if not nonzero.any():
        # Nothing to divide by: every actual is zero (or the input is empty).
        return np.nan

    actual_nz = actual[nonzero]
    relative_errors = np.abs((actual_nz - predicted[nonzero]) / actual_nz)
    return np.mean(relative_errors) * 100
def rmse(y_true, y_pred):
    """Root mean squared error between observed and predicted values.

    Parameters
    ----------
    y_true : array-like
        Observed values.
    y_pred : array-like
        Predicted values, aligned position-by-position with ``y_true``.

    Returns
    -------
    float
        ``sqrt(mean((y_true - y_pred) ** 2))``.
    """
    residuals = np.array(y_true) - np.array(y_pred)
    mean_squared = np.mean(np.square(residuals))
    return np.sqrt(mean_squared)
def seasonal_naive_forecast(series, horizon=30, season_length=7):
    """Seasonal-naive forecast: repeat the last observed season.

    The last ``season_length`` observations are cycled until ``horizon``
    values have been produced.

    Parameters
    ----------
    series : pandas.Series
        Historical observations in chronological order.
    horizon : int, default 30
        Number of future steps to forecast.
    season_length : int, default 7
        Number of trailing observations that make up one season.

    Returns
    -------
    numpy.ndarray
        Array of length ``horizon`` (empty when ``horizon`` is not positive).
        If ``series`` holds fewer than ``season_length`` observations, the
        observations that do exist are cycled instead. If ``series`` is
        empty, the forecast is all-NaN, matching what
        ``moving_average_forecast`` yields for an empty history.

    Raises
    ------
    ValueError
        If ``season_length`` is smaller than 1.
    """
    if season_length < 1:
        raise ValueError("season_length must be >= 1")

    last = series.tail(season_length).values
    if len(last) == 0:
        # No history to repeat; keep the output length contract anyway.
        return np.full(max(horizon, 0), np.nan)

    # Size the repetition count from what was actually retrieved, so a short
    # history still fills the whole horizon instead of being truncated.
    reps = int(np.ceil(horizon / len(last)))
    return np.tile(last, reps)[:horizon]
def moving_average_forecast(series, window=7, horizon=30):
    """Flat forecast equal to the mean of the most recent observations.

    Parameters
    ----------
    series : pandas.Series
        Historical observations in chronological order.
    window : int, default 7
        Number of trailing observations to average.
    horizon : int, default 30
        Number of future steps to forecast.

    Returns
    -------
    numpy.ndarray
        The trailing mean repeated ``horizon`` times.
    """
    recent = series.tail(window)
    level = recent.mean()
    return np.repeat(level, horizon)

In [3]:
# Prophet model for all SKUs
#
# For every SKU: hold out the last PROPHET_HOLDOUT_ROWS rows, fit Prophet on
# the rest, score the hold-out with MAPE and RMSE, and persist the fitted model.
import os

PROPHET_HOLDOUT_ROWS = 30  # trailing rows per SKU reserved for evaluation
PROPHET_MODEL_DIR = '../models/prophet_models'

try:
    from prophet import Prophet
    prophet_available = True
except Exception as e:
    # Prophet is optional in this notebook: report why the import failed and
    # fall through to the skip message below instead of aborting the run.
    print('Prophet not available:', e)
    prophet_available = False

results = []
if prophet_available:
    import joblib

    # Loop-invariant: create the output directory once, not once per SKU.
    os.makedirs(PROPHET_MODEL_DIR, exist_ok=True)

    skus = list(sales_full['sku_id'].unique())
    for sku in skus:
        # Sort by date so the positional split below really separates
        # "history" from "the most recent rows", whatever the file order.
        df = (
            sales_full.loc[sales_full['sku_id'] == sku, ['date', 'units_sold']]
            .rename(columns={'date': 'ds', 'units_sold': 'y'})
            .sort_values('ds')
            .reset_index(drop=True)
        )

        # Prophet needs at least two training rows once the hold-out is removed.
        if len(df) < PROPHET_HOLDOUT_ROWS + 2:
            print(f'Skipping {sku}: only {len(df)} rows, not enough to hold out {PROPHET_HOLDOUT_ROWS}.')
            continue

        train = df.iloc[:-PROPHET_HOLDOUT_ROWS]
        test = df.iloc[-PROPHET_HOLDOUT_ROWS:]

        m = Prophet(weekly_seasonality=True, daily_seasonality=False)
        m.fit(train)

        # Predict on the held-out dates themselves, so each prediction lines up
        # with its actual even if the date range has gaps.
        fc = m.predict(test[['ds']])
        y_pred = fc['yhat'].values
        y_true = test['y'].values

        results.append({
            'sku': sku,
            'mape': float(np.round(mape(y_true, y_pred), 3)),
            'rmse': float(np.round(rmse(y_true, y_pred), 3)),
        })

        # NOTE: the persisted model is the one fitted on `train` only; the
        # held-out rows are not part of it.
        joblib.dump(m, f'{PROPHET_MODEL_DIR}/prophet_{sku}.joblib')

    display(pd.DataFrame(results))
else:
    print('Prophet unavailable - skip this step. Use moving-average baseline instead.')

  from .autonotebook import tqdm as notebook_tqdm
Importing plotly failed. Interactive plots will not work.
Importing plotly failed. Interactive plots will not work.
00:47:21 - cmdstanpy - INFO - Chain [1] start processing
00:47:21 - cmdstanpy - INFO - Chain [1] start processing
00:47:21 - cmdstanpy - INFO - Chain [1] done processing
00:47:21 - cmdstanpy - INFO - Chain [1] done processing
00:47:22 - cmdstanpy - INFO - Chain [1] start processing
00:47:22 - cmdstanpy - INFO - Chain [1] start processing
00:47:22 - cmdstanpy - INFO - Chain [1] done processing
00:47:22 - cmdstanpy - INFO - Chain [1] done processing
00:47:22 - cmdstanpy - INFO - Chain [1] start processing
00:47:22 - cmdstanpy - INFO - Chain [1] start processing
00:47:22 - cmdstanpy - INFO - Chain [1] done processing
00:47:22 - cmdstanpy - INFO - Chain [1] done processing
00:47:22 - cmdstanpy - INFO - Chain [1] start processing
00:47:22 - cmdstanpy - INFO - Chain [1] start processing
00:47:22 - cmdstanpy - INFO - Chain [1] do

Unnamed: 0,sku,mape,rmse
0,SKU_01,14.519,22.06
1,SKU_02,256.372,14.522
2,SKU_03,27.251,3.626
3,SKU_04,12.031,52.866
4,SKU_05,17.267,27.312
5,SKU_06,14.482,10.023
6,SKU_07,177.237,12.332
7,SKU_08,16.388,30.781
8,SKU_09,10.779,20.475
9,SKU_10,13.406,32.042


In [4]:
# XGBoost for all SKUs
#
# For every SKU: build lag / rolling / calendar features, estimate error with
# an expanding-window time-series CV, then refit on all rows and persist.
import os

import joblib
import xgboost as xgb
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import TimeSeriesSplit

XGB_CV_SPLITS = 3
XGB_MODEL_DIR = '../models'

# Loop-invariant definitions, hoisted out of the per-SKU loop.
# NOTE(review): 'on_promo', 'price' and 'views' are taken from the same row as
# the target; confirm they are known at forecast time, otherwise they leak.
features = ['lag_1','lag_7','lag_14','lag_28','rolling_mean_7','rolling_std_14','day_of_week','is_weekend','on_promo','price','views']
target = 'units_sold'

# NOTE(review): CV folds use 200 trees but the persisted model uses 300, so the
# reported CV MAE describes a slightly different configuration. Kept as-is.
shared_params = dict(max_depth=4, verbosity=0, random_state=42)

os.makedirs(XGB_MODEL_DIR, exist_ok=True)

skus = list(sales_full['sku_id'].unique())
xgb_results = []
for sku in skus:
    df = sales_full[sales_full['sku_id']==sku].sort_values('date').copy()

    # Lagged targets; shift() only looks backwards, so no future values are used.
    df['lag_1'] = df['units_sold'].shift(1)
    df['lag_7'] = df['units_sold'].shift(7)
    df['lag_14'] = df['units_sold'].shift(14)
    df['lag_28'] = df['units_sold'].shift(28)
    # Rolling statistics are shifted by one row so the current target is excluded.
    df['rolling_mean_7'] = df['units_sold'].rolling(7).mean().shift(1)
    df['rolling_std_14'] = df['units_sold'].rolling(14).std().shift(1)
    df['day_of_week'] = df['date'].dt.weekday
    df['is_weekend'] = df['day_of_week'].isin([5,6]).astype(int)

    # Drop only rows that are incomplete in the model inputs or target (the
    # lag/rolling warm-up), not rows with NaNs in unrelated columns.
    df = df.dropna(subset=features + [target]).reset_index(drop=True)

    # TimeSeriesSplit needs more samples than folds; skip SKUs that are too short.
    if len(df) < XGB_CV_SPLITS + 1:
        print(f'Skipping {sku}: only {len(df)} usable rows after feature construction.')
        continue

    X = df[features]
    y = df[target]

    tscv = TimeSeriesSplit(n_splits=XGB_CV_SPLITS)
    maes = []
    for fold, (train_idx, test_idx) in enumerate(tscv.split(X), start=1):
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
        model = xgb.XGBRegressor(n_estimators=200, **shared_params)
        model.fit(X_train, y_train)
        preds = model.predict(X_test)
        mae = mean_absolute_error(y_test, preds)
        maes.append(mae)
        print(f'SKU {sku} Fold {fold} MAE: {mae:.3f}')

    # Refit on every available row for the model that is persisted.
    final_model = xgb.XGBRegressor(n_estimators=300, **shared_params)
    final_model.fit(X, y)
    joblib.dump(final_model, f'{XGB_MODEL_DIR}/xgb_{sku}.joblib')
    print(f'Trained XGBoost for SKU: {sku}')
    print('Mean CV MAE:', np.mean(maes))
    xgb_results.append({'sku': sku, 'mean_cv_mae': float(np.mean(maes))})

display(pd.DataFrame(xgb_results))

SKU SKU_01 Fold 1 MAE: 14.488
SKU SKU_01 Fold 2 MAE: 10.526
SKU SKU_01 Fold 3 MAE: 9.903
Trained XGBoost for SKU: SKU_01
Mean CV MAE: 11.639213879903158
SKU SKU_01 Fold 3 MAE: 9.903
Trained XGBoost for SKU: SKU_01
Mean CV MAE: 11.639213879903158
SKU SKU_02 Fold 1 MAE: 2.211
SKU SKU_02 Fold 2 MAE: 1.278
SKU SKU_02 Fold 3 MAE: 0.420
SKU SKU_02 Fold 1 MAE: 2.211
SKU SKU_02 Fold 2 MAE: 1.278
SKU SKU_02 Fold 3 MAE: 0.420
Trained XGBoost for SKU: SKU_02
Mean CV MAE: 1.3028956751028697
SKU SKU_03 Fold 1 MAE: 1.962
Trained XGBoost for SKU: SKU_02
Mean CV MAE: 1.3028956751028697
SKU SKU_03 Fold 1 MAE: 1.962
SKU SKU_03 Fold 2 MAE: 2.098
SKU SKU_03 Fold 3 MAE: 2.450
SKU SKU_03 Fold 2 MAE: 2.098
SKU SKU_03 Fold 3 MAE: 2.450
Trained XGBoost for SKU: SKU_03
Mean CV MAE: 2.170098145802816
SKU SKU_04 Fold 1 MAE: 12.815
SKU SKU_04 Fold 2 MAE: 18.966
Trained XGBoost for SKU: SKU_03
Mean CV MAE: 2.170098145802816
SKU SKU_04 Fold 1 MAE: 12.815
SKU SKU_04 Fold 2 MAE: 18.966
SKU SKU_04 Fold 3 MAE: 16.919
Tr

Unnamed: 0,sku,mean_cv_mae
0,SKU_01,11.639214
1,SKU_02,1.302896
2,SKU_03,2.170098
3,SKU_04,16.233125
4,SKU_05,11.633885
5,SKU_06,8.572725
6,SKU_07,1.805197
7,SKU_08,13.146588
8,SKU_09,13.214417
9,SKU_10,17.90243
