<a href="https://colab.research.google.com/github/Keerthanabs1326/Unlox_project/blob/main/unloxproject.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip -q install pandas numpy matplotlib seaborn scikit-learn statsmodels prophet lightgbm xgboost plotly pyarrow
import pandas as pd, numpy as np
import matplotlib.pyplot as plt, seaborn as sns
import plotly.express as px

In [2]:
DATA_PATH = '/content/drive/MyDrive/Unlox/Retail_Sales_Data_Unlox (1).csv'

df = pd.read_csv(DATA_PATH)
df.columns = [c.strip().lower() for c in df.columns]
df['date'] = pd.to_datetime(df['date'])
df = df.sort_values('date')
df.head()

Unnamed: 0,date,store_id,store_location,product_id,product_category,product_subcategory,brand,unit_price,units_sold,total_sales,discount_percentage,revenue,customer_type,payment_mode,promotion_applied,stock_on_hand,store_rating,region,holiday_flag
14476,2023-01-01,STR_102,Bangalore,PRD_156,Sports,Gym Equipment,Reebok,37973.56,9,341762.04,10,307585.836,New,Credit Card,Yes,247,4.3,South,0
6170,2023-01-01,STR_103,Delhi,PRD_310,Electronics,Laptops,Apple,14336.46,28,401420.88,15,341207.748,New,Cash,No,390,4.9,North,0
20217,2023-01-01,STR_107,Kolkata,PRD_859,Fashion,Women Clothing,Levis,22772.02,22,500984.44,10,450885.996,Returning,UPI,No,279,4.8,East,0
13569,2023-01-01,STR_103,Delhi,PRD_330,Electronics,Laptops,Sony,45314.07,2,90628.14,10,81565.326,Returning,UPI,No,395,4.0,North,0
13752,2023-01-01,STR_105,Pune,PRD_426,Home Appliances,Kitchen,Whirlpool,39077.73,5,195388.65,5,185619.2175,Returning,UPI,Yes,229,4.3,West,0


In [8]:
print(df.dtypes)
print("Rows:", len(df))
print("Date range:", df['date'].min(), "→", df['date'].max())
print("Missing values:\n", df.isna().sum())
print("Unique stores:", df['store_id'].nunique(), "Unique categories:", df['product_category'].nunique())

date                   datetime64[ns]
store_id                       object
store_location                 object
product_id                     object
product_category               object
product_subcategory            object
brand                          object
unit_price                    float64
units_sold                      int64
total_sales                   float64
discount_percentage             int64
revenue                       float64
customer_type                  object
payment_mode                   object
promotion_applied              object
stock_on_hand                   int64
store_rating                  float64
region                         object
holiday_flag                    int64
dtype: object
Rows: 24319
Date range: 2023-01-01 00:00:00 → 2024-12-31 00:00:00
Missing values:
 date                   0
store_id               0
store_location         0
product_id             0
product_category       0
product_subcategory    0
brand                  0
unit_p

In [12]:
daily = (df.groupby(['date','store_id','product_category'], as_index=False)
           .agg({'units_sold':'sum','revenue':'sum','unit_price':'mean'}))

sid = daily['store_id'].sample(1).iloc[0]
cat = daily[daily['store_id']==sid]['product_category'].sample(1).iloc[0]
sample = daily[(daily.store_id==sid) & (daily.product_category==cat)]

fig = px.line(sample, x='date', y='revenue', title=f"Revenue trend | Store {sid} · {cat}")
fig.show()

In [13]:
df = df.drop_duplicates()
if 'revenue' not in df.columns and {'units_sold','unit_price'}.issubset(df.columns):
    df['revenue'] = df['units_sold'] * df['price']
for col in ['units_sold','revenue']:
    if col in df.columns:
        df.loc[df[col] < 0, col] = np.nan
        df[col] = df[col].interpolate(limit_direction='both')
daily = (df.groupby(['date','store_id','product_category'], as_index=False)
           .agg({'units_sold':'sum','revenue':'sum','unit_price':'mean'}))
daily.head()

Unnamed: 0,date,store_id,product_category,units_sold,revenue,unit_price
0,2023-01-01,STR_101,Electronics,86.0,3144245.198,40053.975
1,2023-01-01,STR_101,Home Appliances,37.0,1530093.449,48651.62
2,2023-01-01,STR_101,Sports,21.0,608158.488,36199.91
3,2023-01-01,STR_102,Electronics,75.0,2140210.082,30114.555
4,2023-01-01,STR_102,Sports,50.0,659383.548,24349.55


In [16]:
d = daily.copy()
d['dow'] = d['date'].dt.dayofweek
d['week'] = d['date'].dt.isocalendar().week.astype(int)
d['month'] = d['date'].dt.month
d['quarter'] = d['date'].dt.quarter
d['year'] = d['date'].dt.year
d['is_weekend'] = d['dow'].isin([5,6]).astype(int)

d = d.sort_values(['store_id','product_category','date'])
for lag in [1,7,28]:
    d[f'revenue_lag_{lag}'] = d.groupby(['store_id','product_category'])['revenue'].shift(lag)
    d[f'units_lag_{lag}'] = d.groupby(['store_id','product_category'])['units_sold'].shift(lag)
    d[f'price_lag_{lag}'] = d.groupby(['store_id','product_category'])['unit_price'].shift(lag)

for win in [7,28]:
    d[f'revenue_roll_{win}'] = (d.groupby(['store_id','product_category'])['revenue']
                                 .rolling(win).mean().reset_index(level=[0,1], drop=True))
    d[f'units_roll_{win}'] = (d.groupby(['store_id','product_category'])['units_sold']
                               .rolling(win).mean().reset_index(level=[0,1], drop=True))

d = d.dropna(subset=['revenue_lag_1','units_lag_1']).reset_index(drop=True)
d.head()

Unnamed: 0,date,store_id,product_category,units_sold,revenue,unit_price,dow,week,month,quarter,...,revenue_lag_7,units_lag_7,price_lag_7,revenue_lag_28,units_lag_28,price_lag_28,revenue_roll_7,units_roll_7,revenue_roll_28,units_roll_28
0,2023-01-02,STR_101,Electronics,46.0,2136844.9,46453.15,0,1,1,1,...,,,,,,,,,,
1,2023-01-03,STR_101,Electronics,17.0,17598.519,1150.23,1,1,1,1,...,,,,,,,,,,
2,2023-01-04,STR_101,Electronics,36.0,1517781.24,42160.59,2,1,1,1,...,,,,,,,,,,
3,2023-01-06,STR_101,Electronics,20.0,492309.46,28959.38,4,1,1,1,...,,,,,,,,,,
4,2023-01-10,STR_101,Electronics,35.0,1421769.16,47790.56,1,2,1,1,...,,,,,,,,,,


In [17]:
CUTOFF = pd.to_datetime('2024-10-01')
train = d[d['date'] <= CUTOFF].copy()
val   = d[d['date'] >  CUTOFF].copy()
len(train), len(val), train['date'].max(), val['date'].min()

(14353,
 2050,
 Timestamp('2024-10-01 00:00:00'),
 Timestamp('2024-10-02 00:00:00'))

In [18]:
import numpy as np

def last_value_naive(y, horizon):
    return np.repeat(y.iloc[-1], horizon)

def seasonal_naive(y, horizon, season=7):
    tail = y.iloc[-season:].values
    reps = (horizon // season) + 1
    return np.tile(tail, reps)[:horizon]

In [19]:
from prophet import Prophet

def fit_prophet_series(df_sc):
    m = Prophet(seasonality_mode='multiplicative', weekly_seasonality=True, yearly_seasonality=True)
    ts = df_sc[['date','revenue']].rename(columns={'date':'ds','revenue':'y'})
    m.fit(ts)
    return m

def forecast_prophet_model(m, start_date, horizon):
    future = pd.date_range(start_date + pd.Timedelta(days=1), periods=horizon, freq='D')
    fc = m.predict(pd.DataFrame({'ds': future}))
    return pd.DataFrame({'date': future, 'yhat': fc['yhat']})

In [25]:
from statsmodels.tsa.statespace.sarimax import SARIMAX

def fit_sarimax_series(y):
    # Ensure the series has a DatetimeIndex for SARIMAX
    if not isinstance(y.index, pd.DatetimeIndex):
        # This part of the function assumes y is a Series extracted from a DataFrame
        # that had a 'date' column. It's safer to ensure the index is set before calling this function
        # However, if we must handle it here, we'd need the date column which is not directly available in 'y'
        # For simplicity, we'll assume y_tr will be passed with a DatetimeIndex in the calling code.
        pass

    res = SARIMAX(y, order=(1,1,1), seasonal_order=(1,1,1,7),
                  enforce_stationarity=False, enforce_invertibility=False).fit(disp=False)
    return res

In [57]:
import lightgbm as lgb
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error

FEATURES = [c for c in d.columns if c not in ['date','revenue','units_sold','store_id','product_category','product_id']]

def train_lgbm(train_df, val_df):
    Xtr, ytr = train_df[FEATURES], train_df['revenue']
    Xva, yva = val_df[FEATURES], val_df['revenue']
    params = {
        "objective": "regression",
        "metric": "rmse",
        "learning_rate": 0.05,
        "num_leaves": 31,
        "feature_fraction": 0.9,
        "bagging_fraction": 0.8,
        "bagging_freq": 5
    }
    tr = lgb.Dataset(Xtr, label=ytr)
    va = lgb.Dataset(Xva, label=yva)
    model = lgb.train(params, tr, num_boost_round=1000, valid_sets=[va],
                      callbacks=[lgb.early_stopping(stopping_rounds=50)])
    preds = model.predict(Xva)
    rmse = np.sqrt(mean_squared_error(yva, preds))
    mape = mean_absolute_percentage_error(yva, preds)
    return model, rmse, mape

In [58]:
results = []
best_models = {}  # (store, category) → {'type': 'lgbm'/'prophet'/'sarimax', 'model': object}

for (sid, cat), group in d.groupby(['store_id','product_category']):
    if len(group[group['date'] <= CUTOFF]) < 120 or len(group[group['date'] > CUTOFF]) < 14:
        continue

    tr = group[group['date'] <= CUTOFF]
    va = group[group['date'] >  CUTOFF]
    y_tr, y_va = tr['revenue'], va['revenue']

    # Prophet
    try:
        m_prophet = fit_prophet_series(group[['date','revenue']])
        fc_prophet = forecast_prophet_model(m_prophet, tr['date'].max(), len(va))['yhat'].values
        rmse_prophet = np.sqrt(((y_va.values - fc_prophet)**2).mean())
    except Exception as e:
        rmse_prophet, m_prophet = np.inf, None

    # SARIMAX
    try:
        sarimax_res = fit_sarimax_series(y_tr)
        fc_sarimax = sarimax_res.get_forecast(steps=len(va)).predicted_mean.values
        rmse_sarimax = np.sqrt(((y_va.values - fc_sarimax)**2).mean())
    except Exception as e:
        rmse_sarimax, sarimax_res = np.inf, None

    # LightGBM
    try:
        m_lgbm, rmse_lgbm, mape_lgbm = train_lgbm(tr, va)
    except Exception as e:
        rmse_lgbm, m_lgbm, mape_lgbm = np.inf, None, None

    # Select best
    scores = {'prophet': rmse_prophet, 'sarimax': rmse_sarimax, 'lgbm': rmse_lgbm}
    best_type = min(scores, key=scores.get)
    best_model = {'type': best_type, 'model': {'prophet': m_prophet, 'sarimax': sarimax_res, 'lgbm': m_lgbm}[best_type]}
    best_models[(sid, cat)] = best_model

    results.append({
        'store_id': sid, 'category': cat,
        'rmse_prophet': rmse_prophet,
        'rmse_sarimax': rmse_sarimax,
        'rmse_lgbm': rmse_lgbm,
        'best_type': best_type
    })

res_df = pd.DataFrame(results).sort_values('rmse_lgbm')
res_df.head(10)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001682 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1451
[LightGBM] [Info] Number of data points in the train set: 334, number of used features: 20
[LightGBM] [Info] Start training from score 837467.326522
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[25]	valid_0's rmse: 697934



An unsupported index was provided. As a result, forecasts cannot be generated. To use the model for forecasting, use one of the supported classes of index.


An unsupported index was provided. As a result, forecasts cannot be generated. To use the model for forecasting, use one of the supported classes of index.


No supported index is available. Prediction results will be given with an integer index beginning at `start`.


No supported index is available. In the next version, calling this method in a model without a supported index will result in an exception.


An unsupported index was provided. As a result, forecasts cannot be generated. To use the model for forecasting, use one of the supported classes of index.


An unsupported index was provided. As a result, forecasts cannot be generated. To use the model for forecasting, use one of the supported classes of index.



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.014495 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1535
[LightGBM] [Info] Number of data points in the train set: 358, number of used features: 20
[LightGBM] [Info] Start training from score 847982.276554
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[56]	valid_0's rmse: 514340



No supported index is available. Prediction results will be given with an integer index beginning at `start`.


No supported index is available. In the next version, calling this method in a model without a supported index will result in an exception.


An unsupported index was provided. As a result, forecasts cannot be generated. To use the model for forecasting, use one of the supported classes of index.


An unsupported index was provided. As a result, forecasts cannot be generated. To use the model for forecasting, use one of the supported classes of index.



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000174 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1501
[LightGBM] [Info] Number of data points in the train set: 348, number of used features: 20
[LightGBM] [Info] Start training from score 791320.952183
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[34]	valid_0's rmse: 492447



No supported index is available. Prediction results will be given with an integer index beginning at `start`.


No supported index is available. In the next version, calling this method in a model without a supported index will result in an exception.


An unsupported index was provided. As a result, forecasts cannot be generated. To use the model for forecasting, use one of the supported classes of index.


An unsupported index was provided. As a result, forecasts cannot be generated. To use the model for forecasting, use one of the supported classes of index.



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000162 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1626
[LightGBM] [Info] Number of data points in the train set: 384, number of used features: 20
[LightGBM] [Info] Start training from score 843730.498617
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[46]	valid_0's rmse: 634745



No supported index is available. Prediction results will be given with an integer index beginning at `start`.


No supported index is available. In the next version, calling this method in a model without a supported index will result in an exception.


An unsupported index was provided. As a result, forecasts cannot be generated. To use the model for forecasting, use one of the supported classes of index.


An unsupported index was provided. As a result, forecasts cannot be generated. To use the model for forecasting, use one of the supported classes of index.



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000194 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1569
[LightGBM] [Info] Number of data points in the train set: 366, number of used features: 20
[LightGBM] [Info] Start training from score 861311.297938
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[69]	valid_0's rmse: 702886



No supported index is available. Prediction results will be given with an integer index beginning at `start`.


No supported index is available. In the next version, calling this method in a model without a supported index will result in an exception.


An unsupported index was provided. As a result, forecasts cannot be generated. To use the model for forecasting, use one of the supported classes of index.


An unsupported index was provided. As a result, forecasts cannot be generated. To use the model for forecasting, use one of the supported classes of index.



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000169 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1639
[LightGBM] [Info] Number of data points in the train set: 389, number of used features: 20
[LightGBM] [Info] Start training from score 787886.772255
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[98]	valid_0's rmse: 778373



No supported index is available. Prediction results will be given with an integer index beginning at `start`.


No supported index is available. In the next version, calling this method in a model without a supported index will result in an exception.


An unsupported index was provided. As a result, forecasts cannot be generated. To use the model for forecasting, use one of the supported classes of index.


An unsupported index was provided. As a result, forecasts cannot be generated. To use the model for forecasting, use one of the supported classes of index.



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000150 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1443
[LightGBM] [Info] Number of data points in the train set: 334, number of used features: 20
[LightGBM] [Info] Start training from score 888244.419002
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[70]	valid_0's rmse: 595462



No supported index is available. Prediction results will be given with an integer index beginning at `start`.


No supported index is available. In the next version, calling this method in a model without a supported index will result in an exception.


An unsupported index was provided. As a result, forecasts cannot be generated. To use the model for forecasting, use one of the supported classes of index.


An unsupported index was provided. As a result, forecasts cannot be generated. To use the model for forecasting, use one of the supported classes of index.



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000175 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1575
[LightGBM] [Info] Number of data points in the train set: 368, number of used features: 20
[LightGBM] [Info] Start training from score 828173.709457
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[48]	valid_0's rmse: 723500



No supported index is available. Prediction results will be given with an integer index beginning at `start`.


No supported index is available. In the next version, calling this method in a model without a supported index will result in an exception.


An unsupported index was provided. As a result, forecasts cannot be generated. To use the model for forecasting, use one of the supported classes of index.


An unsupported index was provided. As a result, forecasts cannot be generated. To use the model for forecasting, use one of the supported classes of index.



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000154 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1481
[LightGBM] [Info] Number of data points in the train set: 344, number of used features: 20
[LightGBM] [Info] Start training from score 796676.898957
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[20]	valid_0's rmse: 550717



No supported index is available. Prediction results will be given with an integer index beginning at `start`.


No supported index is available. In the next version, calling this method in a model without a supported index will result in an exception.


An unsupported index was provided. As a result, forecasts cannot be generated. To use the model for forecasting, use one of the supported classes of index.


An unsupported index was provided. As a result, forecasts cannot be generated. To use the model for forecasting, use one of the supported classes of index.



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.014684 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1534
[LightGBM] [Info] Number of data points in the train set: 361, number of used features: 20
[LightGBM] [Info] Start training from score 812273.650531
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[22]	valid_0's rmse: 626533



No supported index is available. Prediction results will be given with an integer index beginning at `start`.


No supported index is available. In the next version, calling this method in a model without a supported index will result in an exception.


An unsupported index was provided. As a result, forecasts cannot be generated. To use the model for forecasting, use one of the supported classes of index.


An unsupported index was provided. As a result, forecasts cannot be generated. To use the model for forecasting, use one of the supported classes of index.



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000171 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1653
[LightGBM] [Info] Number of data points in the train set: 389, number of used features: 20
[LightGBM] [Info] Start training from score 856167.280582
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[40]	valid_0's rmse: 501202



No supported index is available. Prediction results will be given with an integer index beginning at `start`.


No supported index is available. In the next version, calling this method in a model without a supported index will result in an exception.


An unsupported index was provided. As a result, forecasts cannot be generated. To use the model for forecasting, use one of the supported classes of index.


An unsupported index was provided. As a result, forecasts cannot be generated. To use the model for forecasting, use one of the supported classes of index.



[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001973 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1630
[LightGBM] [Info] Number of data points in the train set: 382, number of used features: 20
[LightGBM] [Info] Start training from score 829031.865355
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[33]	valid_0's rmse: 468534



No supported index is available. Prediction results will be given with an integer index beginning at `start`.


No supported index is available. In the next version, calling this method in a model without a supported index will result in an exception.


An unsupported index was provided. As a result, forecasts cannot be generated. To use the model for forecasting, use one of the supported classes of index.


An unsupported index was provided. As a result, forecasts cannot be generated. To use the model for forecasting, use one of the supported classes of index.



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000157 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1504
[LightGBM] [Info] Number of data points in the train set: 350, number of used features: 20
[LightGBM] [Info] Start training from score 870608.885876
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[20]	valid_0's rmse: 519453



No supported index is available. Prediction results will be given with an integer index beginning at `start`.


No supported index is available. In the next version, calling this method in a model without a supported index will result in an exception.


An unsupported index was provided. As a result, forecasts cannot be generated. To use the model for forecasting, use one of the supported classes of index.


An unsupported index was provided. As a result, forecasts cannot be generated. To use the model for forecasting, use one of the supported classes of index.



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000160 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1550
[LightGBM] [Info] Number of data points in the train set: 361, number of used features: 20
[LightGBM] [Info] Start training from score 840860.560024
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[29]	valid_0's rmse: 420926



No supported index is available. Prediction results will be given with an integer index beginning at `start`.


No supported index is available. In the next version, calling this method in a model without a supported index will result in an exception.


An unsupported index was provided. As a result, forecasts cannot be generated. To use the model for forecasting, use one of the supported classes of index.


An unsupported index was provided. As a result, forecasts cannot be generated. To use the model for forecasting, use one of the supported classes of index.



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000169 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1498
[LightGBM] [Info] Number of data points in the train set: 349, number of used features: 20
[LightGBM] [Info] Start training from score 792097.011670
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[38]	valid_0's rmse: 587374



No supported index is available. Prediction results will be given with an integer index beginning at `start`.


No supported index is available. In the next version, calling this method in a model without a supported index will result in an exception.


An unsupported index was provided. As a result, forecasts cannot be generated. To use the model for forecasting, use one of the supported classes of index.


An unsupported index was provided. As a result, forecasts cannot be generated. To use the model for forecasting, use one of the supported classes of index.



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.014629 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1575
[LightGBM] [Info] Number of data points in the train set: 368, number of used features: 20
[LightGBM] [Info] Start training from score 885936.479652
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[42]	valid_0's rmse: 629036



No supported index is available. Prediction results will be given with an integer index beginning at `start`.


No supported index is available. In the next version, calling this method in a model without a supported index will result in an exception.



[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005016 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1479
[LightGBM] [Info] Number of data points in the train set: 341, number of used features: 20
[LightGBM] [Info] Start training from score 853389.300784
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[39]	valid_0's rmse: 653884



An unsupported index was provided. As a result, forecasts cannot be generated. To use the model for forecasting, use one of the supported classes of index.


An unsupported index was provided. As a result, forecasts cannot be generated. To use the model for forecasting, use one of the supported classes of index.


No supported index is available. Prediction results will be given with an integer index beginning at `start`.


No supported index is available. In the next version, calling this method in a model without a supported index will result in an exception.


An unsupported index was provided. As a result, forecasts cannot be generated. To use the model for forecasting, use one of the supported classes of index.


An unsupported index was provided. As a result, forecasts cannot be generated. To use the model for forecasting, use one of the supported classes of index.



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000165 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1609
[LightGBM] [Info] Number of data points in the train set: 375, number of used features: 20
[LightGBM] [Info] Start training from score 808689.826770
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[48]	valid_0's rmse: 522277



No supported index is available. Prediction results will be given with an integer index beginning at `start`.


No supported index is available. In the next version, calling this method in a model without a supported index will result in an exception.


An unsupported index was provided. As a result, forecasts cannot be generated. To use the model for forecasting, use one of the supported classes of index.


An unsupported index was provided. As a result, forecasts cannot be generated. To use the model for forecasting, use one of the supported classes of index.



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000166 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1607
[LightGBM] [Info] Number of data points in the train set: 377, number of used features: 20
[LightGBM] [Info] Start training from score 852443.003500
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[54]	valid_0's rmse: 466416



No supported index is available. Prediction results will be given with an integer index beginning at `start`.


No supported index is available. In the next version, calling this method in a model without a supported index will result in an exception.


An unsupported index was provided. As a result, forecasts cannot be generated. To use the model for forecasting, use one of the supported classes of index.


An unsupported index was provided. As a result, forecasts cannot be generated. To use the model for forecasting, use one of the supported classes of index.



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000163 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1528
[LightGBM] [Info] Number of data points in the train set: 359, number of used features: 20
[LightGBM] [Info] Start training from score 823606.889086
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[40]	valid_0's rmse: 465444



No supported index is available. Prediction results will be given with an integer index beginning at `start`.


No supported index is available. In the next version, calling this method in a model without a supported index will result in an exception.


An unsupported index was provided. As a result, forecasts cannot be generated. To use the model for forecasting, use one of the supported classes of index.


An unsupported index was provided. As a result, forecasts cannot be generated. To use the model for forecasting, use one of the supported classes of index.



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000161 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1531
[LightGBM] [Info] Number of data points in the train set: 358, number of used features: 20
[LightGBM] [Info] Start training from score 883925.579225
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[30]	valid_0's rmse: 694213



No supported index is available. Prediction results will be given with an integer index beginning at `start`.


No supported index is available. In the next version, calling this method in a model without a supported index will result in an exception.


An unsupported index was provided. As a result, forecasts cannot be generated. To use the model for forecasting, use one of the supported classes of index.


An unsupported index was provided. As a result, forecasts cannot be generated. To use the model for forecasting, use one of the supported classes of index.



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.014185 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1560
[LightGBM] [Info] Number of data points in the train set: 362, number of used features: 20
[LightGBM] [Info] Start training from score 869467.199396
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[37]	valid_0's rmse: 468896



No supported index is available. Prediction results will be given with an integer index beginning at `start`.


No supported index is available. In the next version, calling this method in a model without a supported index will result in an exception.


An unsupported index was provided. As a result, forecasts cannot be generated. To use the model for forecasting, use one of the supported classes of index.


An unsupported index was provided. As a result, forecasts cannot be generated. To use the model for forecasting, use one of the supported classes of index.



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000156 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1527
[LightGBM] [Info] Number of data points in the train set: 358, number of used features: 20
[LightGBM] [Info] Start training from score 806058.482453
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[69]	valid_0's rmse: 673156



No supported index is available. Prediction results will be given with an integer index beginning at `start`.


No supported index is available. In the next version, calling this method in a model without a supported index will result in an exception.


An unsupported index was provided. As a result, forecasts cannot be generated. To use the model for forecasting, use one of the supported classes of index.


An unsupported index was provided. As a result, forecasts cannot be generated. To use the model for forecasting, use one of the supported classes of index.



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000149 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1443
[LightGBM] [Info] Number of data points in the train set: 336, number of used features: 20
[LightGBM] [Info] Start training from score 827663.151682
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[19]	valid_0's rmse: 474955



No supported index is available. Prediction results will be given with an integer index beginning at `start`.


No supported index is available. In the next version, calling this method in a model without a supported index will result in an exception.


An unsupported index was provided. As a result, forecasts cannot be generated. To use the model for forecasting, use one of the supported classes of index.


An unsupported index was provided. As a result, forecasts cannot be generated. To use the model for forecasting, use one of the supported classes of index.



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003852 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1553
[LightGBM] [Info] Number of data points in the train set: 362, number of used features: 20
[LightGBM] [Info] Start training from score 804825.438862
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[34]	valid_0's rmse: 488915



No supported index is available. Prediction results will be given with an integer index beginning at `start`.


No supported index is available. In the next version, calling this method in a model without a supported index will result in an exception.


An unsupported index was provided. As a result, forecasts cannot be generated. To use the model for forecasting, use one of the supported classes of index.


An unsupported index was provided. As a result, forecasts cannot be generated. To use the model for forecasting, use one of the supported classes of index.



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000158 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1477
[LightGBM] [Info] Number of data points in the train set: 341, number of used features: 20
[LightGBM] [Info] Start training from score 850024.691480
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[30]	valid_0's rmse: 484337



No supported index is available. Prediction results will be given with an integer index beginning at `start`.


No supported index is available. In the next version, calling this method in a model without a supported index will result in an exception.


An unsupported index was provided. As a result, forecasts cannot be generated. To use the model for forecasting, use one of the supported classes of index.


An unsupported index was provided. As a result, forecasts cannot be generated. To use the model for forecasting, use one of the supported classes of index.



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000158 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1568
[LightGBM] [Info] Number of data points in the train set: 368, number of used features: 20
[LightGBM] [Info] Start training from score 905616.725951
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[23]	valid_0's rmse: 475736



No supported index is available. Prediction results will be given with an integer index beginning at `start`.


No supported index is available. In the next version, calling this method in a model without a supported index will result in an exception.


An unsupported index was provided. As a result, forecasts cannot be generated. To use the model for forecasting, use one of the supported classes of index.


An unsupported index was provided. As a result, forecasts cannot be generated. To use the model for forecasting, use one of the supported classes of index.



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000166 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1548
[LightGBM] [Info] Number of data points in the train set: 363, number of used features: 20
[LightGBM] [Info] Start training from score 848631.063304
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[37]	valid_0's rmse: 815858



No supported index is available. Prediction results will be given with an integer index beginning at `start`.


No supported index is available. In the next version, calling this method in a model without a supported index will result in an exception.


An unsupported index was provided. As a result, forecasts cannot be generated. To use the model for forecasting, use one of the supported classes of index.


An unsupported index was provided. As a result, forecasts cannot be generated. To use the model for forecasting, use one of the supported classes of index.



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000168 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1537
[LightGBM] [Info] Number of data points in the train set: 360, number of used features: 20
[LightGBM] [Info] Start training from score 787699.689265
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[54]	valid_0's rmse: 474971



No supported index is available. Prediction results will be given with an integer index beginning at `start`.


No supported index is available. In the next version, calling this method in a model without a supported index will result in an exception.


An unsupported index was provided. As a result, forecasts cannot be generated. To use the model for forecasting, use one of the supported classes of index.


An unsupported index was provided. As a result, forecasts cannot be generated. To use the model for forecasting, use one of the supported classes of index.



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002121 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1590
[LightGBM] [Info] Number of data points in the train set: 373, number of used features: 20
[LightGBM] [Info] Start training from score 823953.884143
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[45]	valid_0's rmse: 570856



No supported index is available. Prediction results will be given with an integer index beginning at `start`.


No supported index is available. In the next version, calling this method in a model without a supported index will result in an exception.



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.014869 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1544
[LightGBM] [Info] Number of data points in the train set: 361, number of used features: 20
[LightGBM] [Info] Start training from score 845017.500315
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[45]	valid_0's rmse: 695762



An unsupported index was provided. As a result, forecasts cannot be generated. To use the model for forecasting, use one of the supported classes of index.


An unsupported index was provided. As a result, forecasts cannot be generated. To use the model for forecasting, use one of the supported classes of index.


No supported index is available. Prediction results will be given with an integer index beginning at `start`.


No supported index is available. In the next version, calling this method in a model without a supported index will result in an exception.



[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000998 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1627
[LightGBM] [Info] Number of data points in the train set: 385, number of used features: 20
[LightGBM] [Info] Start training from score 786083.296290
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[63]	valid_0's rmse: 410961



An unsupported index was provided. As a result, forecasts cannot be generated. To use the model for forecasting, use one of the supported classes of index.


An unsupported index was provided. As a result, forecasts cannot be generated. To use the model for forecasting, use one of the supported classes of index.


No supported index is available. Prediction results will be given with an integer index beginning at `start`.


No supported index is available. In the next version, calling this method in a model without a supported index will result in an exception.


An unsupported index was provided. As a result, forecasts cannot be generated. To use the model for forecasting, use one of the supported classes of index.


An unsupported index was provided. As a result, forecasts cannot be generated. To use the model for forecasting, use one of the supported classes of index.



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000155 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1487
[LightGBM] [Info] Number of data points in the train set: 343, number of used features: 20
[LightGBM] [Info] Start training from score 835082.056780
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[49]	valid_0's rmse: 552280



No supported index is available. Prediction results will be given with an integer index beginning at `start`.


No supported index is available. In the next version, calling this method in a model without a supported index will result in an exception.


An unsupported index was provided. As a result, forecasts cannot be generated. To use the model for forecasting, use one of the supported classes of index.


An unsupported index was provided. As a result, forecasts cannot be generated. To use the model for forecasting, use one of the supported classes of index.



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002456 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1525
[LightGBM] [Info] Number of data points in the train set: 359, number of used features: 20
[LightGBM] [Info] Start training from score 829203.940413
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[35]	valid_0's rmse: 609436



No supported index is available. Prediction results will be given with an integer index beginning at `start`.


No supported index is available. In the next version, calling this method in a model without a supported index will result in an exception.


An unsupported index was provided. As a result, forecasts cannot be generated. To use the model for forecasting, use one of the supported classes of index.


An unsupported index was provided. As a result, forecasts cannot be generated. To use the model for forecasting, use one of the supported classes of index.



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000153 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1518
[LightGBM] [Info] Number of data points in the train set: 354, number of used features: 20
[LightGBM] [Info] Start training from score 845065.247466
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[24]	valid_0's rmse: 740755



No supported index is available. Prediction results will be given with an integer index beginning at `start`.


No supported index is available. In the next version, calling this method in a model without a supported index will result in an exception.


An unsupported index was provided. As a result, forecasts cannot be generated. To use the model for forecasting, use one of the supported classes of index.


An unsupported index was provided. As a result, forecasts cannot be generated. To use the model for forecasting, use one of the supported classes of index.



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000150 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1485
[LightGBM] [Info] Number of data points in the train set: 341, number of used features: 20
[LightGBM] [Info] Start training from score 840151.029971
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[85]	valid_0's rmse: 470521



No supported index is available. Prediction results will be given with an integer index beginning at `start`.


No supported index is available. In the next version, calling this method in a model without a supported index will result in an exception.


An unsupported index was provided. As a result, forecasts cannot be generated. To use the model for forecasting, use one of the supported classes of index.


An unsupported index was provided. As a result, forecasts cannot be generated. To use the model for forecasting, use one of the supported classes of index.



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.009646 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1507
[LightGBM] [Info] Number of data points in the train set: 351, number of used features: 20
[LightGBM] [Info] Start training from score 797679.880573
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[64]	valid_0's rmse: 627278



No supported index is available. Prediction results will be given with an integer index beginning at `start`.


No supported index is available. In the next version, calling this method in a model without a supported index will result in an exception.


An unsupported index was provided. As a result, forecasts cannot be generated. To use the model for forecasting, use one of the supported classes of index.


An unsupported index was provided. As a result, forecasts cannot be generated. To use the model for forecasting, use one of the supported classes of index.



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.014750 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1452
[LightGBM] [Info] Number of data points in the train set: 337, number of used features: 20
[LightGBM] [Info] Start training from score 775659.537605
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[49]	valid_0's rmse: 569214



No supported index is available. Prediction results will be given with an integer index beginning at `start`.


No supported index is available. In the next version, calling this method in a model without a supported index will result in an exception.


An unsupported index was provided. As a result, forecasts cannot be generated. To use the model for forecasting, use one of the supported classes of index.


An unsupported index was provided. As a result, forecasts cannot be generated. To use the model for forecasting, use one of the supported classes of index.



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000181 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1484
[LightGBM] [Info] Number of data points in the train set: 346, number of used features: 20
[LightGBM] [Info] Start training from score 869993.088900
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[17]	valid_0's rmse: 779851
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000156 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1521
[LightGBM] [Info] Number of data points in the train set: 358, number of used features: 20
[LightGBM] [Info] Start training from score 881538.493626
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[20]	valid_0's rmse: 603461



No supported index is available. Prediction results will be given with an integer index beginning at `start`.


No supported index is available. In the next version, calling this method in a model without a supported index will result in an exception.



Unnamed: 0,store_id,category,rmse_prophet,rmse_sarimax,rmse_lgbm,best_type
31,STR_107,Fashion,inf,649035.332931,410960.574847,lgbm
13,STR_103,Home Appliances,inf,714899.613252,420925.538968,lgbm
19,STR_104,Sports,inf,721658.782246,465443.975935,lgbm
18,STR_104,Home Appliances,inf,634167.268753,466416.229899,lgbm
11,STR_103,Fashion,inf,802368.072013,468533.835025,lgbm
21,STR_105,Fashion,inf,664304.662403,468895.866236,lgbm
35,STR_108,Electronics,inf,738561.561865,470521.121401,lgbm
23,STR_105,Home Appliances,inf,575100.516383,474954.539128,lgbm
28,STR_106,Home Appliances,inf,712039.997866,474971.410684,lgbm
26,STR_106,Fashion,inf,705925.021159,475735.605081,lgbm


In [60]:
HORIZON = 30
fc_all = []

for (sid, cat), bm in best_models.items():
    group = d[(d.store_id==sid) & (d.product_category==cat)].sort_values('date')
    last_date = group['date'].max()

    if bm['type'] == 'prophet':
        fc = forecast_prophet_model(bm['model'], last_date, HORIZON)
        fc['store_id'] = sid; fc['category'] = cat

    elif bm['type'] == 'sarimax':
        steps = bm['model'].get_forecast(steps=HORIZON).predicted_mean
        future_dates = pd.date_range(last_date + pd.Timedelta(days=1), periods=HORIZON)
        fc = pd.DataFrame({'date': future_dates, 'yhat': steps})
        fc['store_id'] = sid; fc['category'] = cat

    else:  # LGBM: use a simple feature extrapolation (approximation)
        last_row = group.iloc[-1:].copy()
        future = pd.concat([last_row]*HORIZON, ignore_index=True)
        future['date'] = pd.date_range(last_date + pd.Timedelta(days=1), periods=HORIZON)
        # IMPORTANT: For true lags, append future iteratively and recompute rolling features.
        preds = bm['model']['booster'].predict(future[FEATURES]) if isinstance(bm['model'], dict) else bm['model'].predict(future[FEATURES])
        fc = pd.DataFrame({'date': future['date'], 'yhat': preds, 'store_id': sid, 'category': cat})

    fc_all.append(fc)

fc_all = pd.concat(fc_all, ignore_index=True)
fc_all.head()

Unnamed: 0,date,yhat,store_id,category
0,2025-01-01,774489.339324,STR_101,Electronics
1,2025-01-02,774489.339324,STR_101,Electronics
2,2025-01-03,774489.339324,STR_101,Electronics
3,2025-01-04,774489.339324,STR_101,Electronics
4,2025-01-05,774489.339324,STR_101,Electronics


In [61]:
fc_store = (fc_all.groupby(['date','store_id'], as_index=False)
            .agg({'yhat':'sum'}).rename(columns={'yhat':'yhat_store'}))

fc_total = (fc_store.groupby('date', as_index=False)
            .agg({'yhat_store':'sum'}).rename(columns={'yhat_store':'yhat_total'}))

fc_store_total = fc_store.merge(fc_total, on='date')
fc_store_total.head()

Unnamed: 0,date,store_id,yhat_store,yhat_total
0,2024-12-27,STR_106,279176.8,1361182.0
1,2024-12-27,STR_107,1082005.0,1361182.0
2,2024-12-28,STR_106,279176.8,1361182.0
3,2024-12-28,STR_107,1082005.0,1361182.0
4,2024-12-29,STR_102,884077.5,4530111.0


In [62]:
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

agg = (d.groupby('store_id')
       .agg(total_revenue=('revenue','sum'),
            avg_ticket=('revenue','mean'),
            revenue_volatility=('revenue','std'),
            units_mean=('units_sold','mean'),
            weekend_share=('is_weekend','mean'))
       .reset_index()).fillna(0)

num_cols = [c for c in agg.columns if c!='store_id']
Xs = StandardScaler().fit_transform(agg[num_cols])
km = KMeans(n_clusters=5, n_init='auto', random_state=42).fit(Xs)
clusters = pd.DataFrame({'store_id': agg['store_id'], 'cluster': km.labels_})
clusters.head()

Unnamed: 0,store_id,cluster
0,STR_101,2
1,STR_102,4
2,STR_103,0
3,STR_104,1
4,STR_105,1


In [63]:
sid = fc_all['store_id'].sample(1).iloc[0]
cat = fc_all[fc_all['store_id']==sid]['category'].sample(1).iloc[0]
hist = d[(d.store_id==sid) & (d.product_category==cat)][['date','revenue']]
future = fc_all[(fc_all.store_id==sid) & (fc_all.category==cat)][['date','yhat']]

fig = px.line(hist, x='date', y='revenue', title=f"History & forecast | Store {sid} · {cat}")
fig.add_scatter(x=future['date'], y=future['yhat'], name='Forecast')
fig.show()

# Cluster counts
cluster_counts = clusters['cluster'].value_counts().reset_index()
cluster_counts.columns = ['cluster','count']
px.bar(cluster_counts, x='cluster', y='count', title='Store clusters').show()

In [64]:
OUT_DIR = '/content/drive/MyDrive/retail_artifacts'
import os
os.makedirs(OUT_DIR, exist_ok=True)

d.to_parquet(f"{OUT_DIR}/daily.parquet")
res_df.to_csv(f"{OUT_DIR}/model_scores.csv", index=False)
fc_all.to_parquet(f"{OUT_DIR}/forecasts.parquet")
clusters.to_csv(f"{OUT_DIR}/store_clusters.csv", index=False)

In [65]:
# Install dependencies
!pip -q install pandas numpy matplotlib seaborn scikit-learn statsmodels prophet lightgbm xgboost plotly pyarrow

import pandas as pd, numpy as np
import plotly.express as px

# Load CSV (adjust the path)
DATA_PATH = '/content/drive/MyDrive/Unlox/Retail_Sales_Data_Unlox (1).csv'
df = pd.read_csv(DATA_PATH, sep=None, engine='python')

# Normalize column names (replace spaces with underscores)
df.columns = [c.strip().replace(' ', '_') for c in df.columns]

# Map to canonical names
df = df.rename(columns={
    'Date': 'date',
    'Store_ID': 'store_id',
    'Store_Location': 'store_location',
    'Product_ID': 'product_id',
    'Product_Category': 'category',
    'Product_Subcategory': 'subcategory',
    'Unit_Price': 'unit_price',
    'Units_Sold': 'units_sold',
    'Total_Sales': 'total_sales',
    'Discount_Percentage': 'discount_pct',
    'Revenue': 'revenue',
    'Customer_Type': 'customer_type',
    'Payment_Mode': 'payment_mode',
    'Promotion_Applied': 'promotion_applied',
    'Stock_On_Hand': 'stock_on_hand',
    'Store_Rating': 'store_rating',
    'Region': 'region',
    'Holiday_Flag': 'holiday_flag'
})

# Types and cleaning
df['date'] = pd.to_datetime(df['date'], errors='coerce')
num_cols = ['unit_price','units_sold','total_sales','discount_pct','revenue','stock_on_hand','store_rating','holiday_flag']
for c in num_cols:
    if c in df.columns:
        df[c] = pd.to_numeric(df[c], errors='coerce')

# Sanity: derive revenue if missing (Total_Sales * (1 - discount_pct/100))
if 'revenue' in df.columns and df['revenue'].isna().any():
    mask = df['revenue'].isna() & df['total_sales'].notna() & df['discount_pct'].notna()
    df.loc[mask, 'revenue'] = df.loc[mask, 'total_sales'] * (1 - df.loc[mask, 'discount_pct']/100.0)

# Basic filtering
df = df.dropna(subset=['date','store_id','category','units_sold','unit_price']).sort_values('date')
df['promotion_applied_flag'] = (df['promotion_applied'].astype(str).str.lower() == 'yes').astype(int)

# Daily store-category rollup
daily = (df.groupby(['date','store_id','category'], as_index=False)
           .agg({
               'units_sold':'sum',
               'revenue':'sum',
               'unit_price':'mean',
               'holiday_flag':'max',
               'promotion_applied_flag':'max'
           }))

# Quick plot to verify
sid = daily['store_id'].sample(1).iloc[0]
cat = daily[daily['store_id']==sid]['category'].sample(1).iloc[0]
px.line(daily[(daily.store_id==sid)&(daily.category==cat)], x='date', y='revenue',
        title=f'Revenue trend | {sid} · {cat}').show()

In [66]:
# SARIMAX expanding-window CV and grid search
from statsmodels.tsa.statespace.sarimax import SARIMAX
from sklearn.metrics import mean_squared_error

def sarimax_expanding_cv(y, order, seasonal_order, folds=3):
    errors = []
    n = len(y)
    split_size = n // (folds + 1)
    for i in range(1, folds+1):
        train = y[:i*split_size]
        test   = y[i*split_size:(i+1)*split_size]
        try:
            res = SARIMAX(train, order=order, seasonal_order=seasonal_order,
                          enforce_stationarity=False, enforce_invertibility=False).fit(disp=False)
            pred = res.get_forecast(steps=len(test)).predicted_mean
            rmse = np.sqrt(mean_squared_error(test, pred))
        except Exception:
            rmse = np.inf
        errors.append(rmse)
    return float(np.mean(errors))

def sarimax_grid_search(y, orders, seasonal_orders, folds=3):
    best = {'rmse': np.inf, 'order': None, 'seasonal_order': None}
    for order in orders:
        for sorder in seasonal_orders:
            rmse = sarimax_expanding_cv(y, order, sorder, folds=folds)
            if rmse < best['rmse']:
                best = {'rmse': rmse, 'order': order, 'seasonal_order': sorder}
    return best

# Example usage for a single store-category series
sid, cat = d[['store_id','product_category']].drop_duplicates().iloc[0]
series = d[(d.store_id==sid) & (d.product_category==cat)].sort_values('date')['revenue']
orders = [(1,1,1), (2,1,2), (0,1,1)]
seasonal_orders = [(1,1,1,7), (0,1,1,7)]
best_sarimax = sarimax_grid_search(series.values, orders, seasonal_orders, folds=3)
best_sarimax

{'rmse': 836427.6361915525, 'order': (0, 1, 1), 'seasonal_order': (0, 1, 1, 7)}

In [67]:
# LightGBM time-series CV (expanding window via TimeSeriesSplit)
from sklearn.model_selection import TimeSeriesSplit

def lgbm_time_cv(df_sc, features, folds=3):
    tscv = TimeSeriesSplit(n_splits=folds)
    rmses = []
    for tr_idx, va_idx in tscv.split(df_sc):
        tr, va = df_sc.iloc[tr_idx], df_sc.iloc[va_idx]
        model, rmse, _ = train_lgbm(tr, va)
        rmses.append(rmse)
    return float(np.mean(rmses))

# Example usage
df_sc = d[(d.store_id==sid) & (d.product_category==cat)].sort_values('date')
cv_rmse_lgbm = lgbm_time_cv(df_sc, FEATURES, folds=3)
cv_rmse_lgbm

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000070 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 441
[LightGBM] [Info] Number of data points in the train set: 96, number of used features: 19
[LightGBM] [Info] Start training from score 940426.107483
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[111]	valid_0's rmse: 574643
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000088 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 874
[LightGBM] [Info] Number of data points in the train set: 192, number of used features: 19
[LightGBM] [Info] Start training from score 881966.159835
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[105]	valid_0's rmse: 581030
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the o

590230.2673245225

In [69]:
from prophet import Prophet

def fit_prophet_series(df_sc):
    m = Prophet(seasonality_mode='multiplicative', weekly_seasonality=True, yearly_seasonality=True)
    ts = df_sc[['date','revenue']].rename(columns={'date':'ds','revenue':'y'})
    m.fit(ts)
    return m

def forecast_prophet_model(m, last_date, horizon):
    future = pd.date_range(last_date + pd.Timedelta(days=1), periods=horizon, freq='D')
    fc = m.predict(pd.DataFrame({'ds': future}))
    return pd.DataFrame({'date': future, 'yhat': fc['yhat']})

results = []
best_models = {}

for (sid, cat), group in d.groupby(['store_id','product_category']):
    tr = group[group['date'] <= CUTOFF].sort_values('date')
    va = group[group['date'] >  CUTOFF].sort_values('date')
    if len(tr) < 120 or len(va) < 14:
        continue

    y_va = va['revenue'].values

    # Prophet
    try:
        m_prophet = fit_prophet_series(group[['date','revenue']])
        fc_prophet = forecast_prophet_model(m_prophet, tr['date'].max(), len(va))['yhat'].values
        rmse_prophet = float(np.sqrt(((y_va - fc_prophet)**2).mean()))
    except Exception:
        rmse_prophet, m_prophet = np.inf, None

    # SARIMAX (using a default; you can plug best_sarimax per series)
    try:
        sarimax_res = SARIMAX(tr['revenue'], order=(1,1,1), seasonal_order=(1,1,1,7),
                              enforce_stationarity=False, enforce_invertibility=False).fit(disp=False)
        fc_sarimax = sarimax_res.get_forecast(steps=len(va)).predicted_mean.values
        rmse_sarimax = float(np.sqrt(((y_va - fc_sarimax)**2).mean()))
    except Exception:
        rmse_sarimax, sarimax_res = np.inf, None

    # LightGBM
    try:
        m_lgbm, rmse_lgbm, mape_lgbm = train_lgbm(tr, va)
    except Exception:
        rmse_lgbm, m_lgbm, mape_lgbm = np.inf, None, None

    scores = {'prophet': rmse_prophet, 'sarimax': rmse_sarimax, 'lgbm': rmse_lgbm}
    best_type = min(scores, key=scores.get)
    best_models[(sid, cat)] = {'type': best_type,
                               'model': {'prophet': m_prophet, 'sarimax': sarimax_res, 'lgbm': m_lgbm}[best_type]}

    results.append({
        'store_id': sid, 'category': cat,
        'rmse_prophet': rmse_prophet,
        'rmse_sarimax': rmse_sarimax,
        'rmse_lgbm': rmse_lgbm,
        'best_type': best_type
    })

res_df = pd.DataFrame(results).sort_values('rmse_lgbm')
res_df.head(10)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000157 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1451
[LightGBM] [Info] Number of data points in the train set: 334, number of used features: 20
[LightGBM] [Info] Start training from score 837467.326522
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[25]	valid_0's rmse: 697934



An unsupported index was provided. As a result, forecasts cannot be generated. To use the model for forecasting, use one of the supported classes of index.


An unsupported index was provided. As a result, forecasts cannot be generated. To use the model for forecasting, use one of the supported classes of index.


No supported index is available. Prediction results will be given with an integer index beginning at `start`.


No supported index is available. In the next version, calling this method in a model without a supported index will result in an exception.



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000148 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1535
[LightGBM] [Info] Number of data points in the train set: 358, number of used features: 20
[LightGBM] [Info] Start training from score 847982.276554
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[56]	valid_0's rmse: 514340



An unsupported index was provided. As a result, forecasts cannot be generated. To use the model for forecasting, use one of the supported classes of index.


An unsupported index was provided. As a result, forecasts cannot be generated. To use the model for forecasting, use one of the supported classes of index.


No supported index is available. Prediction results will be given with an integer index beginning at `start`.


No supported index is available. In the next version, calling this method in a model without a supported index will result in an exception.


An unsupported index was provided. As a result, forecasts cannot be generated. To use the model for forecasting, use one of the supported classes of index.


An unsupported index was provided. As a result, forecasts cannot be generated. To use the model for forecasting, use one of the supported classes of index.



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000150 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1501
[LightGBM] [Info] Number of data points in the train set: 348, number of used features: 20
[LightGBM] [Info] Start training from score 791320.952183
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[34]	valid_0's rmse: 492447



No supported index is available. Prediction results will be given with an integer index beginning at `start`.


No supported index is available. In the next version, calling this method in a model without a supported index will result in an exception.


An unsupported index was provided. As a result, forecasts cannot be generated. To use the model for forecasting, use one of the supported classes of index.


An unsupported index was provided. As a result, forecasts cannot be generated. To use the model for forecasting, use one of the supported classes of index.



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000160 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1626
[LightGBM] [Info] Number of data points in the train set: 384, number of used features: 20
[LightGBM] [Info] Start training from score 843730.498617
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[46]	valid_0's rmse: 634745



No supported index is available. Prediction results will be given with an integer index beginning at `start`.


No supported index is available. In the next version, calling this method in a model without a supported index will result in an exception.


An unsupported index was provided. As a result, forecasts cannot be generated. To use the model for forecasting, use one of the supported classes of index.


An unsupported index was provided. As a result, forecasts cannot be generated. To use the model for forecasting, use one of the supported classes of index.



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000156 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1569
[LightGBM] [Info] Number of data points in the train set: 366, number of used features: 20
[LightGBM] [Info] Start training from score 861311.297938
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[69]	valid_0's rmse: 702886



No supported index is available. Prediction results will be given with an integer index beginning at `start`.


No supported index is available. In the next version, calling this method in a model without a supported index will result in an exception.


An unsupported index was provided. As a result, forecasts cannot be generated. To use the model for forecasting, use one of the supported classes of index.


An unsupported index was provided. As a result, forecasts cannot be generated. To use the model for forecasting, use one of the supported classes of index.



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000167 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1639
[LightGBM] [Info] Number of data points in the train set: 389, number of used features: 20
[LightGBM] [Info] Start training from score 787886.772255
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[98]	valid_0's rmse: 778373



No supported index is available. Prediction results will be given with an integer index beginning at `start`.


No supported index is available. In the next version, calling this method in a model without a supported index will result in an exception.


An unsupported index was provided. As a result, forecasts cannot be generated. To use the model for forecasting, use one of the supported classes of index.


An unsupported index was provided. As a result, forecasts cannot be generated. To use the model for forecasting, use one of the supported classes of index.



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000146 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1443
[LightGBM] [Info] Number of data points in the train set: 334, number of used features: 20
[LightGBM] [Info] Start training from score 888244.419002
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[70]	valid_0's rmse: 595462



No supported index is available. Prediction results will be given with an integer index beginning at `start`.


No supported index is available. In the next version, calling this method in a model without a supported index will result in an exception.


An unsupported index was provided. As a result, forecasts cannot be generated. To use the model for forecasting, use one of the supported classes of index.


An unsupported index was provided. As a result, forecasts cannot be generated. To use the model for forecasting, use one of the supported classes of index.



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.014848 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1575
[LightGBM] [Info] Number of data points in the train set: 368, number of used features: 20
[LightGBM] [Info] Start training from score 828173.709457
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[48]	valid_0's rmse: 723500



No supported index is available. Prediction results will be given with an integer index beginning at `start`.


No supported index is available. In the next version, calling this method in a model without a supported index will result in an exception.


An unsupported index was provided. As a result, forecasts cannot be generated. To use the model for forecasting, use one of the supported classes of index.


An unsupported index was provided. As a result, forecasts cannot be generated. To use the model for forecasting, use one of the supported classes of index.



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.014921 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1481
[LightGBM] [Info] Number of data points in the train set: 344, number of used features: 20
[LightGBM] [Info] Start training from score 796676.898957
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[20]	valid_0's rmse: 550717



No supported index is available. Prediction results will be given with an integer index beginning at `start`.


No supported index is available. In the next version, calling this method in a model without a supported index will result in an exception.


An unsupported index was provided. As a result, forecasts cannot be generated. To use the model for forecasting, use one of the supported classes of index.


An unsupported index was provided. As a result, forecasts cannot be generated. To use the model for forecasting, use one of the supported classes of index.



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.014397 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1534
[LightGBM] [Info] Number of data points in the train set: 361, number of used features: 20
[LightGBM] [Info] Start training from score 812273.650531
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[22]	valid_0's rmse: 626533



No supported index is available. Prediction results will be given with an integer index beginning at `start`.


No supported index is available. In the next version, calling this method in a model without a supported index will result in an exception.


An unsupported index was provided. As a result, forecasts cannot be generated. To use the model for forecasting, use one of the supported classes of index.


An unsupported index was provided. As a result, forecasts cannot be generated. To use the model for forecasting, use one of the supported classes of index.



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000171 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1653
[LightGBM] [Info] Number of data points in the train set: 389, number of used features: 20
[LightGBM] [Info] Start training from score 856167.280582
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[40]	valid_0's rmse: 501202



No supported index is available. Prediction results will be given with an integer index beginning at `start`.


No supported index is available. In the next version, calling this method in a model without a supported index will result in an exception.


An unsupported index was provided. As a result, forecasts cannot be generated. To use the model for forecasting, use one of the supported classes of index.


An unsupported index was provided. As a result, forecasts cannot be generated. To use the model for forecasting, use one of the supported classes of index.



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000166 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1630
[LightGBM] [Info] Number of data points in the train set: 382, number of used features: 20
[LightGBM] [Info] Start training from score 829031.865355
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[33]	valid_0's rmse: 468534



No supported index is available. Prediction results will be given with an integer index beginning at `start`.


No supported index is available. In the next version, calling this method in a model without a supported index will result in an exception.


An unsupported index was provided. As a result, forecasts cannot be generated. To use the model for forecasting, use one of the supported classes of index.


An unsupported index was provided. As a result, forecasts cannot be generated. To use the model for forecasting, use one of the supported classes of index.



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000413 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1504
[LightGBM] [Info] Number of data points in the train set: 350, number of used features: 20
[LightGBM] [Info] Start training from score 870608.885876
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[20]	valid_0's rmse: 519453



No supported index is available. Prediction results will be given with an integer index beginning at `start`.


No supported index is available. In the next version, calling this method in a model without a supported index will result in an exception.


An unsupported index was provided. As a result, forecasts cannot be generated. To use the model for forecasting, use one of the supported classes of index.


An unsupported index was provided. As a result, forecasts cannot be generated. To use the model for forecasting, use one of the supported classes of index.



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.011409 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1550
[LightGBM] [Info] Number of data points in the train set: 361, number of used features: 20
[LightGBM] [Info] Start training from score 840860.560024
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[29]	valid_0's rmse: 420926



No supported index is available. Prediction results will be given with an integer index beginning at `start`.


No supported index is available. In the next version, calling this method in a model without a supported index will result in an exception.


An unsupported index was provided. As a result, forecasts cannot be generated. To use the model for forecasting, use one of the supported classes of index.


An unsupported index was provided. As a result, forecasts cannot be generated. To use the model for forecasting, use one of the supported classes of index.



[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.006064 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1498
[LightGBM] [Info] Number of data points in the train set: 349, number of used features: 20
[LightGBM] [Info] Start training from score 792097.011670
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[38]	valid_0's rmse: 587374



No supported index is available. Prediction results will be given with an integer index beginning at `start`.


No supported index is available. In the next version, calling this method in a model without a supported index will result in an exception.


An unsupported index was provided. As a result, forecasts cannot be generated. To use the model for forecasting, use one of the supported classes of index.


An unsupported index was provided. As a result, forecasts cannot be generated. To use the model for forecasting, use one of the supported classes of index.



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.010705 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1575
[LightGBM] [Info] Number of data points in the train set: 368, number of used features: 20
[LightGBM] [Info] Start training from score 885936.479652
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[42]	valid_0's rmse: 629036



No supported index is available. Prediction results will be given with an integer index beginning at `start`.


No supported index is available. In the next version, calling this method in a model without a supported index will result in an exception.


An unsupported index was provided. As a result, forecasts cannot be generated. To use the model for forecasting, use one of the supported classes of index.


An unsupported index was provided. As a result, forecasts cannot be generated. To use the model for forecasting, use one of the supported classes of index.



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000153 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1479
[LightGBM] [Info] Number of data points in the train set: 341, number of used features: 20
[LightGBM] [Info] Start training from score 853389.300784
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[39]	valid_0's rmse: 653884



No supported index is available. Prediction results will be given with an integer index beginning at `start`.


No supported index is available. In the next version, calling this method in a model without a supported index will result in an exception.


An unsupported index was provided. As a result, forecasts cannot be generated. To use the model for forecasting, use one of the supported classes of index.


An unsupported index was provided. As a result, forecasts cannot be generated. To use the model for forecasting, use one of the supported classes of index.



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000174 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1609
[LightGBM] [Info] Number of data points in the train set: 375, number of used features: 20
[LightGBM] [Info] Start training from score 808689.826770
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[48]	valid_0's rmse: 522277



No supported index is available. Prediction results will be given with an integer index beginning at `start`.


No supported index is available. In the next version, calling this method in a model without a supported index will result in an exception.


An unsupported index was provided. As a result, forecasts cannot be generated. To use the model for forecasting, use one of the supported classes of index.


An unsupported index was provided. As a result, forecasts cannot be generated. To use the model for forecasting, use one of the supported classes of index.



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000168 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1607
[LightGBM] [Info] Number of data points in the train set: 377, number of used features: 20
[LightGBM] [Info] Start training from score 852443.003500
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[54]	valid_0's rmse: 466416



No supported index is available. Prediction results will be given with an integer index beginning at `start`.


No supported index is available. In the next version, calling this method in a model without a supported index will result in an exception.


An unsupported index was provided. As a result, forecasts cannot be generated. To use the model for forecasting, use one of the supported classes of index.


An unsupported index was provided. As a result, forecasts cannot be generated. To use the model for forecasting, use one of the supported classes of index.



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.011619 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1528
[LightGBM] [Info] Number of data points in the train set: 359, number of used features: 20
[LightGBM] [Info] Start training from score 823606.889086
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[40]	valid_0's rmse: 465444



No supported index is available. Prediction results will be given with an integer index beginning at `start`.


No supported index is available. In the next version, calling this method in a model without a supported index will result in an exception.


An unsupported index was provided. As a result, forecasts cannot be generated. To use the model for forecasting, use one of the supported classes of index.


An unsupported index was provided. As a result, forecasts cannot be generated. To use the model for forecasting, use one of the supported classes of index.



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000210 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1531
[LightGBM] [Info] Number of data points in the train set: 358, number of used features: 20
[LightGBM] [Info] Start training from score 883925.579225
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[30]	valid_0's rmse: 694213



No supported index is available. Prediction results will be given with an integer index beginning at `start`.


No supported index is available. In the next version, calling this method in a model without a supported index will result in an exception.


An unsupported index was provided. As a result, forecasts cannot be generated. To use the model for forecasting, use one of the supported classes of index.


An unsupported index was provided. As a result, forecasts cannot be generated. To use the model for forecasting, use one of the supported classes of index.



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000166 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1560
[LightGBM] [Info] Number of data points in the train set: 362, number of used features: 20
[LightGBM] [Info] Start training from score 869467.199396
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[37]	valid_0's rmse: 468896



No supported index is available. Prediction results will be given with an integer index beginning at `start`.


No supported index is available. In the next version, calling this method in a model without a supported index will result in an exception.


An unsupported index was provided. As a result, forecasts cannot be generated. To use the model for forecasting, use one of the supported classes of index.


An unsupported index was provided. As a result, forecasts cannot be generated. To use the model for forecasting, use one of the supported classes of index.



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000162 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1527
[LightGBM] [Info] Number of data points in the train set: 358, number of used features: 20
[LightGBM] [Info] Start training from score 806058.482453
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[69]	valid_0's rmse: 673156



No supported index is available. Prediction results will be given with an integer index beginning at `start`.


No supported index is available. In the next version, calling this method in a model without a supported index will result in an exception.


An unsupported index was provided. As a result, forecasts cannot be generated. To use the model for forecasting, use one of the supported classes of index.


An unsupported index was provided. As a result, forecasts cannot be generated. To use the model for forecasting, use one of the supported classes of index.



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000153 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1443
[LightGBM] [Info] Number of data points in the train set: 336, number of used features: 20
[LightGBM] [Info] Start training from score 827663.151682
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[19]	valid_0's rmse: 474955



No supported index is available. Prediction results will be given with an integer index beginning at `start`.


No supported index is available. In the next version, calling this method in a model without a supported index will result in an exception.


An unsupported index was provided. As a result, forecasts cannot be generated. To use the model for forecasting, use one of the supported classes of index.


An unsupported index was provided. As a result, forecasts cannot be generated. To use the model for forecasting, use one of the supported classes of index.



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000157 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1553
[LightGBM] [Info] Number of data points in the train set: 362, number of used features: 20
[LightGBM] [Info] Start training from score 804825.438862
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[34]	valid_0's rmse: 488915



No supported index is available. Prediction results will be given with an integer index beginning at `start`.


No supported index is available. In the next version, calling this method in a model without a supported index will result in an exception.


An unsupported index was provided. As a result, forecasts cannot be generated. To use the model for forecasting, use one of the supported classes of index.


An unsupported index was provided. As a result, forecasts cannot be generated. To use the model for forecasting, use one of the supported classes of index.



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.014833 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1477
[LightGBM] [Info] Number of data points in the train set: 341, number of used features: 20
[LightGBM] [Info] Start training from score 850024.691480
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[30]	valid_0's rmse: 484337



No supported index is available. Prediction results will be given with an integer index beginning at `start`.


No supported index is available. In the next version, calling this method in a model without a supported index will result in an exception.


An unsupported index was provided. As a result, forecasts cannot be generated. To use the model for forecasting, use one of the supported classes of index.


An unsupported index was provided. As a result, forecasts cannot be generated. To use the model for forecasting, use one of the supported classes of index.



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000167 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1568
[LightGBM] [Info] Number of data points in the train set: 368, number of used features: 20
[LightGBM] [Info] Start training from score 905616.725951
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[23]	valid_0's rmse: 475736



No supported index is available. Prediction results will be given with an integer index beginning at `start`.


No supported index is available. In the next version, calling this method in a model without a supported index will result in an exception.


An unsupported index was provided. As a result, forecasts cannot be generated. To use the model for forecasting, use one of the supported classes of index.


An unsupported index was provided. As a result, forecasts cannot be generated. To use the model for forecasting, use one of the supported classes of index.



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001639 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1548
[LightGBM] [Info] Number of data points in the train set: 363, number of used features: 20
[LightGBM] [Info] Start training from score 848631.063304
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[37]	valid_0's rmse: 815858



No supported index is available. Prediction results will be given with an integer index beginning at `start`.


No supported index is available. In the next version, calling this method in a model without a supported index will result in an exception.


An unsupported index was provided. As a result, forecasts cannot be generated. To use the model for forecasting, use one of the supported classes of index.


An unsupported index was provided. As a result, forecasts cannot be generated. To use the model for forecasting, use one of the supported classes of index.



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000151 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1537
[LightGBM] [Info] Number of data points in the train set: 360, number of used features: 20
[LightGBM] [Info] Start training from score 787699.689265
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[54]	valid_0's rmse: 474971



No supported index is available. Prediction results will be given with an integer index beginning at `start`.


No supported index is available. In the next version, calling this method in a model without a supported index will result in an exception.


An unsupported index was provided. As a result, forecasts cannot be generated. To use the model for forecasting, use one of the supported classes of index.


An unsupported index was provided. As a result, forecasts cannot be generated. To use the model for forecasting, use one of the supported classes of index.



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000163 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1590
[LightGBM] [Info] Number of data points in the train set: 373, number of used features: 20
[LightGBM] [Info] Start training from score 823953.884143
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[45]	valid_0's rmse: 570856



No supported index is available. Prediction results will be given with an integer index beginning at `start`.


No supported index is available. In the next version, calling this method in a model without a supported index will result in an exception.



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000156 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1544
[LightGBM] [Info] Number of data points in the train set: 361, number of used features: 20
[LightGBM] [Info] Start training from score 845017.500315
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[45]	valid_0's rmse: 695762



An unsupported index was provided. As a result, forecasts cannot be generated. To use the model for forecasting, use one of the supported classes of index.


An unsupported index was provided. As a result, forecasts cannot be generated. To use the model for forecasting, use one of the supported classes of index.


No supported index is available. Prediction results will be given with an integer index beginning at `start`.


No supported index is available. In the next version, calling this method in a model without a supported index will result in an exception.



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000162 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1627
[LightGBM] [Info] Number of data points in the train set: 385, number of used features: 20
[LightGBM] [Info] Start training from score 786083.296290
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[63]	valid_0's rmse: 410961



An unsupported index was provided. As a result, forecasts cannot be generated. To use the model for forecasting, use one of the supported classes of index.


An unsupported index was provided. As a result, forecasts cannot be generated. To use the model for forecasting, use one of the supported classes of index.


No supported index is available. Prediction results will be given with an integer index beginning at `start`.


No supported index is available. In the next version, calling this method in a model without a supported index will result in an exception.


An unsupported index was provided. As a result, forecasts cannot be generated. To use the model for forecasting, use one of the supported classes of index.


An unsupported index was provided. As a result, forecasts cannot be generated. To use the model for forecasting, use one of the supported classes of index.



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.014470 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1487
[LightGBM] [Info] Number of data points in the train set: 343, number of used features: 20
[LightGBM] [Info] Start training from score 835082.056780
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[49]	valid_0's rmse: 552280



No supported index is available. Prediction results will be given with an integer index beginning at `start`.


No supported index is available. In the next version, calling this method in a model without a supported index will result in an exception.


An unsupported index was provided. As a result, forecasts cannot be generated. To use the model for forecasting, use one of the supported classes of index.


An unsupported index was provided. As a result, forecasts cannot be generated. To use the model for forecasting, use one of the supported classes of index.



[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.006716 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1525
[LightGBM] [Info] Number of data points in the train set: 359, number of used features: 20
[LightGBM] [Info] Start training from score 829203.940413
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[35]	valid_0's rmse: 609436



No supported index is available. Prediction results will be given with an integer index beginning at `start`.


No supported index is available. In the next version, calling this method in a model without a supported index will result in an exception.


An unsupported index was provided. As a result, forecasts cannot be generated. To use the model for forecasting, use one of the supported classes of index.


An unsupported index was provided. As a result, forecasts cannot be generated. To use the model for forecasting, use one of the supported classes of index.



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000154 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1518
[LightGBM] [Info] Number of data points in the train set: 354, number of used features: 20
[LightGBM] [Info] Start training from score 845065.247466
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[24]	valid_0's rmse: 740755



No supported index is available. Prediction results will be given with an integer index beginning at `start`.


No supported index is available. In the next version, calling this method in a model without a supported index will result in an exception.


An unsupported index was provided. As a result, forecasts cannot be generated. To use the model for forecasting, use one of the supported classes of index.


An unsupported index was provided. As a result, forecasts cannot be generated. To use the model for forecasting, use one of the supported classes of index.



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.014871 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1485
[LightGBM] [Info] Number of data points in the train set: 341, number of used features: 20
[LightGBM] [Info] Start training from score 840151.029971
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[85]	valid_0's rmse: 470521



No supported index is available. Prediction results will be given with an integer index beginning at `start`.


No supported index is available. In the next version, calling this method in a model without a supported index will result in an exception.


An unsupported index was provided. As a result, forecasts cannot be generated. To use the model for forecasting, use one of the supported classes of index.


An unsupported index was provided. As a result, forecasts cannot be generated. To use the model for forecasting, use one of the supported classes of index.



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000155 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1507
[LightGBM] [Info] Number of data points in the train set: 351, number of used features: 20
[LightGBM] [Info] Start training from score 797679.880573
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[64]	valid_0's rmse: 627278



No supported index is available. Prediction results will be given with an integer index beginning at `start`.


No supported index is available. In the next version, calling this method in a model without a supported index will result in an exception.


An unsupported index was provided. As a result, forecasts cannot be generated. To use the model for forecasting, use one of the supported classes of index.


An unsupported index was provided. As a result, forecasts cannot be generated. To use the model for forecasting, use one of the supported classes of index.



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.014744 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1452
[LightGBM] [Info] Number of data points in the train set: 337, number of used features: 20
[LightGBM] [Info] Start training from score 775659.537605
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[49]	valid_0's rmse: 569214



No supported index is available. Prediction results will be given with an integer index beginning at `start`.


No supported index is available. In the next version, calling this method in a model without a supported index will result in an exception.


An unsupported index was provided. As a result, forecasts cannot be generated. To use the model for forecasting, use one of the supported classes of index.


An unsupported index was provided. As a result, forecasts cannot be generated. To use the model for forecasting, use one of the supported classes of index.



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000152 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1484
[LightGBM] [Info] Number of data points in the train set: 346, number of used features: 20
[LightGBM] [Info] Start training from score 869993.088900
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[17]	valid_0's rmse: 779851
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005491 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1521
[LightGBM] [Info] Number of data points in the train set: 358, number of used features: 20
[LightGBM] [Info] Start training from score 881538.493626
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[20]	valid_0's rmse: 60346


No supported index is available. Prediction results will be given with an integer index beginning at `start`.


No supported index is available. In the next version, calling this method in a model without a supported index will result in an exception.



Unnamed: 0,store_id,category,rmse_prophet,rmse_sarimax,rmse_lgbm,best_type
31,STR_107,Fashion,inf,649035.332931,410960.574847,lgbm
13,STR_103,Home Appliances,inf,714899.613252,420925.538968,lgbm
19,STR_104,Sports,inf,721658.782246,465443.975935,lgbm
18,STR_104,Home Appliances,inf,634167.268753,466416.229899,lgbm
11,STR_103,Fashion,inf,802368.072013,468533.835025,lgbm
21,STR_105,Fashion,inf,664304.662403,468895.866236,lgbm
35,STR_108,Electronics,inf,738561.561865,470521.121401,lgbm
23,STR_105,Home Appliances,inf,575100.516383,474954.539128,lgbm
28,STR_106,Home Appliances,inf,712039.997866,474971.410684,lgbm
26,STR_106,Fashion,inf,705925.021159,475735.605081,lgbm


In [72]:
# Generate 30-day forecasts per best model with iterative ML for lgbm
HORIZON = 30
fc_all = []

# Define the iterative_forecast_lgbm function here
def iterative_forecast_lgbm(model, group_data, horizon, features_list):
    forecast_results = []
    # Work on a copy to avoid modifying the original group_data
    current_series = group_data.copy().sort_values('date').reset_index(drop=True)

    # Ensure current_series has all necessary columns for feature calculation
    # Fill any missing columns (e.g., from previous dropna) with NaNs if they were in FEATURES
    initial_cols_for_features = ['units_sold', 'unit_price', 'revenue']
    for col in initial_cols_for_features:
        if col not in current_series.columns:
            current_series[col] = np.nan # Add missing columns if any

    for i in range(1, horizon + 1):
        # Determine the next date to forecast
        next_date = current_series['date'].max() + pd.Timedelta(days=1)

        # Create a new row for feature calculation
        new_row = {'date': next_date,
                   'store_id': current_series['store_id'].iloc[-1],
                   'product_category': current_series['product_category'].iloc[-1]}

        # Add date-derived features
        new_row['dow'] = next_date.dayofweek
        new_row['week'] = next_date.isocalendar().week
        new_row['month'] = next_date.month
        new_row['quarter'] = next_date.quarter
        new_row['year'] = next_date.year
        new_row['is_weekend'] = int(next_date.dayofweek in [5, 6])

        # For 'units_sold' and 'unit_price', use the last known values as a base for feature generation.
        # This is a simplification; ideally these would also be forecasted or determined by external factors.
        new_row['units_sold'] = current_series['units_sold'].iloc[-1]
        new_row['unit_price'] = current_series['unit_price'].iloc[-1]
        new_row['revenue'] = np.nan # Placeholder for the prediction

        # Convert to a DataFrame row
        new_row_df = pd.DataFrame([new_row])

        # Temporarily concatenate to calculate features based on extended history
        # Make sure columns are consistent before concat
        temp_history = pd.concat([current_series, new_row_df], ignore_index=True)
        temp_history = temp_history.sort_values('date').reset_index(drop=True)

        # Calculate lagged features for the new row (last row of temp_history)
        for lag in [1, 7, 28]:
            temp_history[f'revenue_lag_{lag}'] = temp_history['revenue'].shift(lag)
            temp_history[f'units_lag_{lag}'] = temp_history['units_sold'].shift(lag)
            temp_history[f'price_lag_{lag}'] = temp_history['unit_price'].shift(lag)

        # Calculate rolling features for the new row (last row of temp_history)
        for win in [7, 28]:
            temp_history[f'revenue_roll_{win}'] = temp_history['revenue'].rolling(win).mean()
            temp_history[f'units_roll_{win}'] = temp_history['units_sold'].rolling(win).mean()

        # Extract the features for the current prediction step (the last row of temp_history)
        # Ensure all features in `features_list` are present and in correct order
        predict_features_df = temp_history.iloc[[-1]][features_list]

        # Make the prediction
        pred_revenue = model.predict(predict_features_df)[0]

        # Update the 'revenue' for the new row in `current_series` with the prediction
        current_series.loc[current_series.index[-1], 'revenue'] = pred_revenue

        # Store the forecast result
        forecast_results.append({'date': next_date, 'yhat': pred_revenue})

    return pd.DataFrame(forecast_results)

for (sid, cat), bm in best_models.items():
    group = d[(d.store_id==sid) & (d.product_category==cat)].sort_values('date')
    last_date = group['date'].max()

    if bm['type'] == 'lgbm':
        fc = iterative_forecast_lgbm(bm['model'], group, HORIZON, FEATURES)
    elif bm['type'] == 'prophet':
        fc = forecast_prophet_model(bm['model'], last_date, HORIZON)
    else: # SARIMAX
        steps = bm['model'].get_forecast(steps=HORIZON).predicted_mean
        future_dates = pd.date_range(last_date + pd.Timedelta(days=1), periods=HORIZON)
        fc = pd.DataFrame({'date': future_dates, 'yhat': steps})

    fc['store_id'] = sid
    fc['category'] = cat
    fc_all.append(fc)

fc_all = pd.concat(fc_all, ignore_index=True)

# Bottom-up reconciliation to store and total
fc_store = (fc_all.groupby(['date','store_id'], as_index=False)
            .agg({'yhat':'sum'}).rename(columns={'yhat':'yhat_store'}))
fc_total = (fc_store.groupby('date', as_index=False)
            .agg({'yhat_store':'sum'}).rename(columns={'yhat_store':'yhat_total'}))

In [73]:
# Save artifacts to Drive
OUT_DIR = '/content/drive/MyDrive/retail_artifacts'
import os
os.makedirs(OUT_DIR, exist_ok=True)

d.to_parquet(f"{OUT_DIR}/daily.parquet")
fc_all.to_parquet(f"{OUT_DIR}/forecasts.parquet")
fc_store.to_parquet(f"{OUT_DIR}/forecasts_store.parquet")
fc_total.to_parquet(f"{OUT_DIR}/forecasts_total.parquet")

# Minimal Streamlit app
app_code = """
import streamlit as st
import pandas as pd
import plotly.express as px

st.set_page_config(page_title='Retail Analytics & Forecasting', layout='wide')
daily = pd.read_parquet('daily.parquet')
forecasts = pd.read_parquet('forecasts.parquet')

st.sidebar.title('Filters')
store = st.sidebar.selectbox('Store', sorted(daily['store_id'].unique()))
category = st.sidebar.selectbox('Category', sorted(daily[daily['store_id']==store]['category'].unique()))

hist = daily[(daily.store_id==store) & (daily.category==category)][['date','revenue']]
future = forecasts[(forecasts.store_id==store) & (forecasts.category==category)][['date','yhat']]

fig = px.line(hist, x='date', y='revenue', title=f'History | {store} · {category}')
fig.add_scatter(x=future['date'], y=future['yhat'], name='Forecast')
st.plotly_chart(fig, use_container_width=True)
"""

with open(f"{OUT_DIR}/app.py", "w") as f:
    f.write(app_code)

print("Artifacts saved to:", OUT_DIR)
print("Run dashboard locally: streamlit run app.py")

Artifacts saved to: /content/drive/MyDrive/retail_artifacts
Run dashboard locally: streamlit run app.py


In [75]:
sid = fc_all['store_id'].sample(1).iloc[0]
cat = fc_all[fc_all['store_id']==sid]['category'].sample(1).iloc[0]
hist = d[(d.store_id==sid) & (d.product_category==cat)][['date','revenue']]
future = fc_all[(fc_all.store_id==sid) & (fc_all.category==cat)][['date','yhat']]
fig = px.line(hist, x='date', y='revenue', title=f'History & forecast | {sid} · {cat}')
fig.add_scatter(x=future['date'], y=future['yhat'], name='Forecast')
fig.show()