# Imports

In [1]:
# === Imports (centralized) ===
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
from statsmodels.tsa.seasonal import STL, seasonal_decompose
from statsmodels.stats.outliers_influence import variance_inflation_factor
from scipy.stats import boxcox
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.linear_model import LassoCV, RidgeCV, LinearRegression, ElasticNetCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import TimeSeriesSplit
from sklearn.impute import SimpleImputer
from sklearn.inspection import permutation_importance
from sklearn.linear_model import Ridge
import boto3
from pyathena import connect

# Tables Uploads

In [4]:
# === Load .env, read AWS creds, and connect to Athena ===
import os, sys, subprocess, pathlib

# Try to import python-dotenv, install if missing
try:
    from dotenv import load_dotenv, find_dotenv  # type: ignore
except Exception:
    try:
        subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", "python-dotenv"])
        from dotenv import load_dotenv, find_dotenv  # type: ignore
    except Exception:
        # Minimal fallback: parse .env manually (no override of existing env)
        p = pathlib.Path(".env")
        if p.exists():
            for line in p.read_text().splitlines():
                line = line.strip()
                if line and not line.startswith("#") and "=" in line:
                    k, v = line.split("=", 1)
                    os.environ.setdefault(k.strip(), v.strip())

# Load .env (no override of existing env)
load_dotenv(find_dotenv(usecwd=True) or ".env", override=False)

# Read config (don’t print secrets)
REGION = os.getenv("AWS_DEFAULT_REGION", "us-east-1")
WORKGROUP = os.getenv("ATHENA_WORKGROUP", "primary")
ATHENA_RESULTS_S3 = os.getenv("ATHENA_RESULTS_S3", "").strip()

# If using long-lived AKIA keys, clear stale session token
if os.getenv("AWS_ACCESS_KEY_ID", "").startswith("AKIA"):
    os.environ.pop("AWS_SESSION_TOKEN", None)

# Guard: staging dir required
if not ATHENA_RESULTS_S3:
    raise RuntimeError(
        "ATHENA_RESULTS_S3 is not set. Add s3://bucket/prefix to your .env, then rerun this cell."
    )

# Optional quick identity check
try:
    import boto3
    sts = boto3.client("sts", region_name=REGION)
    ident = sts.get_caller_identity()
    print("Caller:", ident.get("Arn"))
except Exception as e:
    print("Warning: could not verify identity via STS:", e)

# Connect to Athena (env creds or default chain)
from pyathena import connect

kwargs = dict(
    s3_staging_dir=ATHENA_RESULTS_S3,
    region_name=REGION,
    work_group=WORKGROUP,
)

ak = os.getenv("AWS_ACCESS_KEY_ID")
sk = os.getenv("AWS_SECRET_ACCESS_KEY")
st = os.getenv("AWS_SESSION_TOKEN")

if ak and sk:
    kwargs.update(aws_access_key_id=ak, aws_secret_access_key=sk)
    if st:
        kwargs.update(aws_session_token=st)

conn = connect(**kwargs)
print("Athena connect: OK")


Caller: arn:aws:iam::216097839164:user/Nayyera2
Athena connect: OK


In [5]:
# UPLOADING Heatmap
conn = connect(s3_staging_dir=ATHENA_RESULTS_S3, region_name=REGION, work_group="primary")
sql = "SELECT * FROM little_caesars.heatmap"
heatmap = pd.read_sql(sql, conn)

# Make sure store_number is numeric
heatmap["store_number"] = pd.to_numeric(heatmap["store_number"], errors="coerce")

# Remove store_number from 364001 to 364010
heatmap = heatmap[~heatmap["store_number"].between(364001, 364010)]

heatmap.head()




  heatmap = pd.read_sql(sql, conn)


Unnamed: 0,store_number,carryout_sales_pizza,carryout_sales_non_cash_coupon,delivery_sales_pizza_mobile_app,carryout_sales_school_lunch_or_church_org,carryout_sales_promotional_sales,carryout_sales_sandwich,carryout_sales_salad,carryout_sales_dessert,carryout_sales_alcoholic_beverage,...,ingestion_date,organizational_expenses,rou_lease_expense,business_tax_expense,controllable_profits,profit_sales_pct,store_lc,year,month,day
10,364011,100040.32,0,9678.5,0,0,0,0,0,0,...,09/18/2025,,,,,,,2022,8,8
11,364012,39693.93,0,5317.66,0,0,0,0,0,0,...,09/18/2025,,,,,,,2022,8,8
12,364013,58118.02,0,7808.24,0,0,0,0,0,0,...,09/18/2025,,,,,,,2022,8,8
13,364014,66244.67,0,8472.24,0,0,0,0,0,0,...,09/18/2025,,,,,,,2022,8,8
14,364015,29086.17,0,5594.41,0,0,0,0,0,0,...,09/18/2025,,,,,,,2022,8,8


In [6]:
# UPLOADING Hierachy
conn = connect(s3_staging_dir=ATHENA_RESULTS_S3, region_name=REGION, work_group="primary")
sql = "SELECT * FROM little_caesars.hierarchy"
geo_stores = pd.read_sql(sql, conn)
geo_stores.columns = geo_stores.iloc[0]
geo_stores = geo_stores[1:].reset_index(drop=True)
geo_stores.head()

  geo_stores = pd.read_sql(sql, conn)


Unnamed: 0,Store_Number,Heatmap_Store_Number,Reduced_Store_Number,Store_Name,Full_Store,Director,DO,Market,Store_Address,State,Director_Email,Email_Address,Date,None,2025
0,03647-00011,364011,11,KCK,KCK 03647-00011,,Josh Butler,KS Market,151 S. 18th Street Suite K,Kansas,,jbutler@apricusqsr.com,2023-10-22,,2025
1,03647-00012,364012,12,Leavenworth,Leavenworth 03647-00012,,Josh Butler,KS Market,2024 S. 4th Street Leavenworth,Kansas,,jbutler@apricusqsr.com,2023-10-22,,2025
2,03647-00013,364013,13,Topeka,Topeka 03647-00013,,Josh Butler,KS Market,5329 SW 21st Street Topeka,Kansas,,jbutler@apricusqsr.com,2023-10-22,,2025
3,03647-00014,364014,14,Olathe,Olathe 03647-00014,,Josh Butler,KS Market,904 E Santa Fe St. Olathe,Kansas,,jbutler@apricusqsr.com,2023-10-22,,2025
4,03647-00015,364015,15,Lawrence,Lawrence 03647-00015,,Josh Butler,KS Market,1528 W 23rd St. Lawrence,Kansas,,jbutler@apricusqsr.com,2023-10-22,,2025


In [7]:
# UPLOADING Budget
conn = connect(s3_staging_dir=ATHENA_RESULTS_S3, region_name=REGION, work_group="primary")
sql = "SELECT * FROM little_caesars.budget_heatmap"
budget = pd.read_sql(sql, conn)
budget.head()

  budget = pd.read_sql(sql, conn)


Unnamed: 0,period,401k_company_match,administrative_expenses,adv_caesarfund_print,aws,accrued_delivery_expense_mobile_app,accrued_delivery_transaction_expense_d,accrued_tips_withholding_mobile_app,admin_employee_benefits,admin_payroll_taxes,...,uniforms,use_tax,waste_removal,water,workers_compensation,store_name,store_number,period_number,endingdate,year
0,1,0,,0,11569.868625,0,0,0,0,46.279474500000006,...,46.279474500000006,0,100,100,149.05646589731424,Illinois,364070,1,2025-01-27,2025
1,2,0,,0,11829.344625,0,0,0,0,47.3173785,...,47.3173785,0,100,100,149.05646589731424,Illinois,364070,2,2025-02-24,2025
2,3,0,,0,12478.137,0,0,0,0,49.912548,...,49.912548,0,100,100,149.05646589731424,Illinois,364070,3,2025-03-24,2025
3,4,0,,0,12446.196000000002,0,0,0,0,49.78478400000001,...,49.78478400000001,0,100,100,149.05646589731424,Illinois,364070,4,2025-04-21,2025
4,5,0,,0,12377.134875,0,0,0,0,49.5085395,...,49.5085395,0,100,100,149.05646589731424,Illinois,364070,5,2025-05-19,2025


In [8]:
# Load
sql = "SELECT * FROM data_science_lgg.heatmap_features_final"
c_heatmap = pd.read_sql(sql, conn)

# Pick store column
store_col = "store_number" if "store_number" in c_heatmap.columns else \
            [c for c in c_heatmap.columns if "store" in c.lower() and "number" in c.lower()][0]

# Exclude set (as strings)
excl = {str(364000+i) for i in range(1,11)}

# Normalize to digits-only for matching
key = c_heatmap[store_col].astype(str).str.replace(r"\D", "", regex=True).str.lstrip("0")

print("Present BEFORE:", sorted(key[key.isin(excl)].unique().tolist()))
c_heatmap = c_heatmap.loc[~key.isin(excl)].copy()
key_after = c_heatmap[store_col].astype(str).str.replace(r"\D", "", regex=True).str.lstrip("0")
print("Present AFTER:", sorted(key_after[key_after.isin(excl)].unique().tolist()))
c_heatmap.head()

  c_heatmap = pd.read_sql(sql, conn)


Present BEFORE: ['364001', '364002', '364003', '364004', '364005', '364006', '364007', '364008', '364009', '364010']
Present AFTER: []


Unnamed: 0,store_number,fiscal_year,fiscal_period,event_period,event_date,state,total_net_sales,total_labor_cost,total_food_or_paper_cost,total_occupancy,...,halloween_flag,march_madness_flag,nfl_playoffs_flag,nfl_regular_season_flag,nfl_sponsorship_flag,new_years_eve_flag,super_bowl_flag,thanksgiving_eve_flag,cpi,drive_thru_flag
0,364026,2025,9,P9,2025-09-08,Michigan,141491.0,33028.0,52333.0,3836.0,...,0,0,0,1,0,0,0,0,323.364,0
1,364027,2025,9,P9,2025-09-08,New York,69846.0,20308.0,22999.0,1985.0,...,0,0,0,1,0,0,0,0,323.364,0
2,364028,2025,9,P9,2025-09-08,New York,82860.0,23354.0,27119.0,2613.0,...,0,0,0,1,0,0,0,0,323.364,0
3,364029,2025,9,P9,2025-09-08,New York,99428.0,27359.0,32793.0,3308.0,...,0,0,0,1,0,0,0,0,323.364,0
4,364030,2025,9,P9,2025-09-08,New York,117337.0,30020.0,38891.0,2686.0,...,0,0,0,1,0,0,0,0,323.364,0


In [None]:
# # UPLOADING labor
# conn = connect(s3_staging_dir=ATHENA_RESULTS_S3, region_name=REGION, work_group="primary")
# sql = "SELECT * FROM little_caesars.weekly_labor"
# labor = pd.read_sql(sql, conn)
# labor.head()

  labor = pd.read_sql(sql, conn)


Unnamed: 0,dept,job_title,store_number,event_date,employee_no,first_name,last_name,normal_hours,ot_hours,dt_hrs,...,normal_pay,ot_pay,dt_pay,total_pay,labor_percent,ingestion_date,weekly_labor_job_filter,year,month,day
0,Management,SHIFT LEADER,03647-00011,2024-07-17,6265887,michael,smith,7.12,0.0,0.0,...,92.56,0.0,0.0,92.56,2.05,09/08/2025,,2024,7,17
1,Management,SHIFT LEADER,03647-00011,2024-05-24,6265887,michael,smith,7.13,0.0,0.0,...,92.69,0.0,0.0,92.69,1.62,09/08/2025,,2024,5,24
2,Staff,CREW,03647-00011,2024-05-24,6346557,christopher,rasa,4.12,0.0,0.0,...,49.44,0.0,0.0,49.44,0.86,09/08/2025,,2024,5,24
3,Management,SHIFT LEADER,03647-00011,2024-05-24,6359295,alex,spann,1.53,0.0,0.0,...,21.42,0.0,0.0,21.42,0.37,09/08/2025,,2024,5,24
4,Staff,CREW,03647-00011,2024-07-17,6346557,christopher,rasa,5.65,0.0,0.0,...,67.8,0.0,0.0,67.8,1.51,09/08/2025,,2024,7,17


In [18]:
events = pd.read_excel('../data/events_flag.xlsx')
cal = pd.read_excel('../data/fiscal_calendar.xlsx')
cpi_raw = pd.read_excel('../data/cpi_data.xlsx')

In [19]:
# stores to excludeb agreed with Leonor
exclude_stores = [364001, 364002, 364003, 364004, 364005, 364006, 364007, 364008, 364009, 364010]

# Select required columns and exclude the stores
h = heatmap[
    ~heatmap["store_number"].isin(exclude_stores)
][[
    "event_date",
    "total_net_sales",
    "store_number",
    "total_advertising"
]]

h.head()


Unnamed: 0,event_date,total_net_sales,store_number,total_advertising
10,2022-08-08,159470.66000000003,364011,7016.71
11,2022-08-08,68507.49,364012,3014.33
12,2022-08-08,96940.32,364013,4265.37
13,2022-08-08,115751.10000000002,364014,5093.05
14,2022-08-08,52933.84,364015,2329.09


final runner

# Testing RUnner

### Long History 

In [20]:
# Super Bowl events taken as-is from c_heatmap (uses event_date that already exists)
import pandas as pd

EVENT_FEAT = 'Super_Bowl'  # standardize the feature name

events_df = (
    c_heatmap[['event_date','store_number','super_bowl_flag']].copy()
      .rename(columns={'super_bowl_flag': EVENT_FEAT})
)

# Ensure date is datetime-normalized; keep one row per date×store (max keeps 1 if any row was flagged)
events_df['event_date'] = pd.to_datetime(events_df['event_date'], errors='coerce').dt.normalize()
events_df['store_number'] = events_df['store_number'].astype(str).str.strip()
events_df[EVENT_FEAT] = pd.to_numeric(events_df[EVENT_FEAT], errors='coerce').fillna(0).astype(int)
events_df = (events_df
             .groupby(['event_date','store_number'], as_index=False)[EVENT_FEAT]
             .max())

# Quick sanity
ones = int((events_df[EVENT_FEAT] == 1).sum())
zeros = int((events_df[EVENT_FEAT] == 0).sum())
print(f"events_df[{EVENT_FEAT}] — ones={ones}, zeros={zeros}, rows={len(events_df)}")


events_df[Super_Bowl] — ones=108, zeros=1245, rows=1353


In [22]:
import os, sys

# If you're inside the /forecasting folder, add the parent folder to sys.path
if os.path.basename(os.getcwd()).lower() == 'forecasting':
    sys.path.insert(0, os.path.abspath('..'))

from forecasting.runner_utils import (
    ensure_project_root,
    run_pooled_ridge,
    filter_active_and_excluded,
)

# Make sure all relative paths resolve correctly from the project root
ensure_project_root()


In [23]:
# === Build pooled forecasts (long-history layer) ===
from forecasting.runner_utils import run_pooled_ridge, filter_active_and_excluded

# Pick inputs already in memory
cal_df = cal2 if "cal2" in globals() else cal
cpi_df = cpi_raw if "cpi_raw" in globals() else (cpi if "cpi" in globals() else None)
events = events_df if "events_df" in globals() else None

# Run pooled ridge (H=6) with your feature set
fc_store, feats, metrics = run_pooled_ridge(
    heatmap=heatmap[["event_date","total_net_sales","store_number","total_advertising"]],
    calendar_df=cal_df,
    cpi_df=cpi_df,
    events_df=events,
    feature_list=["total_net_sales_m1","adv_m1","CPI_m1","p13_sin","p13_cos","is_5w","Super Bowl"],
    alpha=5.0, holdout_k=6, min_history_train=12, min_history_eval=18, H=6, verbose=True,
)
print("[Train] metrics:", metrics)
print("[OK] fc_store:", fc_store.shape)

# Optional: filter to active stores + model excludes (using geo_stores if present)
excluded = set() 
if "geo_stores" in globals():
    fc_store = filter_active_and_excluded(fc_store, geo_stores=geo_stores, excluded_keys=excluded)
    print("[Filter] kept rows:", len(fc_store))

# Optional: attach State (if geo_stores has Heatmap_Store_Number)
if "geo_stores" in globals() and "Heatmap_Store_Number" in geo_stores.columns:
    geo = geo_stores[["Heatmap_Store_Number","State"]].copy()
    geo["Heatmap_Store_Number"] = geo["Heatmap_Store_Number"].astype(str).str.strip()
    fc_w_state = (fc_store.merge(geo, left_on="store_number", right_on="Heatmap_Store_Number", how="left")
                         .drop(columns="Heatmap_Store_Number"))
    print("[OK] fc_w_state:", fc_w_state.shape)
else:
    fc_w_state = None

# Quick preview
with pd.option_context("display.float_format", "{:,.0f}".format):
    print("fc_store (top 5):"); display(fc_store.head(5))
    if fc_w_state is not None:
        print("fc_w_state (top 5):"); display(fc_w_state.head(5))


[FeatureBuilder] final features rows: 54224 (dropped 62)
[Train] holdout WAPE: 0.0079 | rows_eval=366
[Train] stores_train=61 | stores_eval=61 | rows_train=53858 | rows_eval=366 | alpha=5.0
[FeatureBuilder] next-period design rows: 61
[FeatureBuilder] next-period design rows: 61
[FeatureBuilder] next-period design rows: 61
[FeatureBuilder] next-period design rows: 61
[FeatureBuilder] next-period design rows: 61
[FeatureBuilder] next-period design rows: 61
[Train] metrics: {'stores_train': 61.0, 'stores_eval': 61.0, 'rows_train': 53858.0, 'rows_eval': 366.0, 'alpha': 5.0, 'wape_holdout': 0.007945409232796005}
[OK] fc_store: (366, 5)
[Filter] kept rows: 366
[OK] fc_w_state: (366, 6)
fc_store (top 5):


Unnamed: 0,store_number,fiscal_year,fiscal_period,forecast,horizon
0,364011,2025,12,175349,1
1,364012,2025,12,66425,1
2,364013,2025,12,102666,1
3,364014,2025,12,105411,1
4,364015,2025,12,45412,1


fc_w_state (top 5):


Unnamed: 0,store_number,fiscal_year,fiscal_period,forecast,horizon,State
0,364011,2025,12,175349,1,Kansas
1,364012,2025,12,66425,1,Kansas
2,364013,2025,12,102666,1,Kansas
3,364014,2025,12,105411,1,Kansas
4,364015,2025,12,45412,1,Kansas


### Peer Creation 

In [None]:
# Build peer_table_k3_short from c_heatmap (uses runner_utils wrappers)
from forecasting.runner_utils import (
    build_profile_from_c_heatmap, pca_from_profile, lock_k3_clusters,
    summarize_store_history, peers_for_short_only
)

# 1) Profile -> PCA -> locked k=3 clusters
pcav1_profile = build_profile_from_c_heatmap(c_heatmap)
pca_scores_v1, _, _ = pca_from_profile(pcav1_profile)
cluster_table_k3 = lock_k3_clusters(pca_scores_v1)

# 2) Store summary + peers for short stores (<18 periods)
store_summary = summarize_store_history(c_heatmap)
peer_table_k3_short = peers_for_short_only(cluster_table_k3, store_summary, threshold=18, max_peers=3)

print("[OK] peer_table_k3_short:", peer_table_k3_short.shape)


[OK] peer_table_k3_short: (30, 16)


In [None]:
# === Build peer forecasts for short stores and print checks ===
from forecasting.runner_utils import (
    normalize_hist, normalize_fc_store, split_short_long, last6_axis_from_fc,
    build_peer_scaled_short, finalize_forecasts, join_actuals, wape
)
import importlib, forecasting.runner_utils as ru
importlib.reload(ru)  # ensure we have the latest helper

# 1) Normalize + identify short stores
hist = normalize_hist(c_heatmap)
fc   = normalize_fc_store(fc_store)
short_set, _ = split_short_long(hist, threshold=18)
print("Short stores (count):", len(short_set))

# 2) Axis: use the last 6 (FY, FP) seen in fc_store
axis6 = last6_axis_from_fc(fc)
print("Axis6 (FY,FP):", list(map(tuple, axis6.to_records(index=False))))

# 3) Build peer-scaled forecasts for short stores
#    Uses peers' pooled-model forecasts from fc_store on the axis above
peer_fc = build_peer_scaled_short(hist, eval_hist3 if "eval_hist3" in globals() else None,
                                  fc, peer_table_k3_short, axis6)
print("[OK] peer_fc:", peer_fc.shape)
print("Peer stores (unique):", peer_fc["store_number"].nunique() if not peer_fc.empty else 0)

# 4) Merge with pooled model (keep model for long stores; peer for short)
final_fc = finalize_forecasts(fc, peer_fc, short_set)
print("[OK] final_fc:", final_fc.shape)
print("Rows by source:\n", final_fc["source"].value_counts(dropna=False))

# 5) Optional quick WAPE where actuals exist (will be N/A if future periods only)
joined = join_actuals(final_fc, hist)
for src in ["model","peer"]:
    sub = joined[joined["source"] == src]
    print(f"WAPE ({src}):", "N/A" if sub.empty else round(wape(sub["actual"], sub["forecast"]), 4))

# 6) Preview a couple of peer stores (if any)
if not peer_fc.empty:
    some_peer_store = peer_fc["store_number"].iloc[0]
    print(f"\nSample peer store: {some_peer_store}")
    display(final_fc[final_fc["store_number"] == some_peer_store]
            .sort_values(["fiscal_year","fiscal_period"])
            .head(10))


Short stores (count): 30
Axis6 (FY,FP): [(np.int64(2025), np.int64(12)), (np.int64(2025), np.int64(13)), (np.int64(2026), np.int64(1)), (np.int64(2026), np.int64(2)), (np.int64(2026), np.int64(3)), (np.int64(2026), np.int64(4))]
[OK] peer_fc: (180, 5)
Peer stores (unique): 30
[OK] final_fc: (366, 6)
Rows by source:
 source
model    192
peer     174
Name: count, dtype: int64
WAPE (model): nan
WAPE (peer): nan

Sample peer store: 364045


Unnamed: 0,store_number,fiscal_year,fiscal_period,forecast,horizon,source
34,364045,2025,12,74529.198072,1,peer
95,364045,2025,13,79024.256667,2,peer
156,364045,2026,1,84065.521502,3,peer
217,364045,2026,2,89201.725864,4,peer
278,364045,2026,3,93823.655529,5,peer
339,364045,2026,4,97330.075675,6,peer


### Full Forecasts based on best model

In [None]:
import pandas as pd
import numpy as np
from forecasting.pooled_ridge import ForecastConfig, PooledRidgeForecaster

# ------------------------ Params + basic setup ------------------------
N_LAST = 6  # last-N fiscal periods
fy, fp = "fiscal_year", "fiscal_period"
target = "total_net_sales"
store_col = "store_number"

assert "feats" in globals(), "feats must be in memory."

# Candidate features; keep only those that exist in feats
CAND_FEATURES = [
    "total_net_sales_m1",
    "adv_m1",
    "CPI_m1",
    "p13_sin",
    "p13_cos",
    "is_5w",
    "Super Bowl",
]
FEATURES = [c for c in CAND_FEATURES if c in feats.columns]
if not FEATURES:
    raise RuntimeError("No candidate FEATURES found in feats.")

# ------------------------ Normalize feats ------------------------
f = feats.copy()
f[store_col] = f[store_col].astype(str).str.strip()
for c in [fy, fp]:
    f[c] = pd.to_numeric(f[c], errors="coerce").astype("Int64")

# ------------------------ Actuals: from c_heatmap if available ------------------------
if "c_heatmap" in globals():
    h = c_heatmap.copy()
    h[store_col] = h[store_col].astype(str).str.strip()
    h[fy] = pd.to_numeric(h[fy], errors="coerce").astype("Int64")
    h[fp] = pd.to_numeric(h[fp], errors="coerce").astype("Int64")
    # support either target or 'actual' column
    if target in h.columns:
        h["actual"] = pd.to_numeric(h[target], errors="coerce")
    elif "actual" in h.columns:
        h["actual"] = pd.to_numeric(h["actual"], errors="coerce")
    else:
        raise KeyError("c_heatmap must contain total_net_sales or actual.")
else:
    h = f[[store_col, fy, fp, target]].copy()
    h["actual"] = pd.to_numeric(h[target], errors="coerce")

# Drop rows with missing fiscal info
h = h.dropna(subset=[fy, fp])

# ------------------------ Global last-N fiscal axis ------------------------
axisN = (
    f[[fy, fp]]
    .dropna()
    .drop_duplicates()
    .sort_values([fy, fp])
    .tail(N_LAST)
    .reset_index(drop=True)
)
if axisN.empty or len(axisN) < N_LAST:
    raise RuntimeError(f"Could not build last {N_LAST} fiscal periods axis.")

# ------------------------ Cohort + history metadata ------------------------
per_counts = (
    h.assign(_per=lambda d: d[fy] * 100 + d[fp])
    .groupby(store_col, as_index=False)
    .agg(n_hist_periods=("_per", "nunique"))
)
per_counts[store_col] = per_counts[store_col].astype(str).str.strip()
per_counts["cohort"] = np.where(per_counts["n_hist_periods"] >= 18, "long", "short")

# n_eval_periods: how many actuals in the last-N axis per store
h_axis = h.merge(axisN, on=[fy, fp], how="inner")
n_eval = (
    h_axis.groupby(store_col)["actual"]
    .apply(lambda s: int(s.notna().sum()))
    .reset_index(name="n_eval_periods")
)

# Keep only stores with at least 1 actual in the last-N periods
stores_keep = set(
    n_eval.loc[n_eval["n_eval_periods"] > 0, store_col].astype(str)
)

# ------------------------ True OOS pooled (remove last-N, train, predict last-N) ------------------------
cfg = ForecastConfig(
    feature_list=FEATURES,
    alpha=5.0,
    min_history_train=1,
    min_history_eval=9999,
    holdout_k=0,
    verbose=False,
)

# Mark rows that belong to axisN
axisN_key = axisN.copy()
axisN_key["_axis_flag"] = 1
f_axis = f.merge(axisN_key[[fy, fp, "_axis_flag"]], on=[fy, fp], how="left")

# Training = all rows NOT in axisN
f_train = f_axis[f_axis["_axis_flag"].isna()].drop(columns=["_axis_flag"])
# Eval = only axisN rows
f_eval = f_axis[f_axis["_axis_flag"] == 1].drop(columns=["_axis_flag"])

if f_train.empty:
    raise RuntimeError("No training rows remain after removing last-N axis.")
if f_eval.empty:
    raise RuntimeError("No eval rows found on last-N axis.")

fore = PooledRidgeForecaster(cfg)
_ = fore.train(f_train)  # pooled with store fixed effects, no internal holdout

pred_pooled = fore.predict(f_eval)  # must include [store_number, fy, fp, forecast]
if "forecast" not in pred_pooled.columns:
    raise KeyError("fore.predict(f_eval) must return a 'forecast' column.")
pred_pooled = pred_pooled.rename(columns={"forecast": "yhat_pooled"})
pred_pooled[store_col] = pred_pooled[store_col].astype(str).str.strip()

# ------------------------ True OOS peer (scaled peers on last-N) ------------------------
# Pick peer table
if "peer_table" in globals():
    peer_tbl = peer_table.copy()
elif "peer_table_k3_short" in globals():
    peer_tbl = peer_table_k3_short.copy()
else:
    peer_tbl = None

yhat_peer_rows = []

def _avg_first10(store_id: str) -> float:
    g = (
        h.loc[h[store_col] == store_id, [fy, fp, "actual"]]
        .dropna(subset=["actual"])
        .sort_values([fy, fp])
        .head(10)
    )
    return float("nan") if g.empty else float(g["actual"].mean())

pred_peer = pd.DataFrame(columns=[store_col, fy, fp, "yhat_peer"])

if peer_tbl is not None and not peer_tbl.empty:
    peers = peer_tbl.copy()
    for c in [store_col, "peer1", "peer2", "peer3"]:
        if c in peers.columns:
            peers[c] = peers[c].astype(str).str.strip()

    # OOS pooled preds for peers (on same axis), to be scaled
    pooled_axis = pred_pooled[[store_col, fy, fp, "yhat_pooled"]].copy()

    # Short stores only
    short_ids = set(
        per_counts.loc[per_counts["cohort"] == "short", store_col].astype(str)
    )

    for st in sorted(short_ids):
        if st not in stores_keep:
            continue  # must have at least 1 actual in last-N
        r = peers.loc[peers[store_col] == st]
        if r.empty:
            continue
        r = r.iloc[0]

        # Scaling ratio: short's early mean / peers' early mean
        avg_short = _avg_first10(st)
        peer10 = [
            _avg_first10(str(r.get(p, "")))
            for p in ["peer1", "peer2", "peer3"]
            if p in peers.columns
        ]
        peer10 = [v for v in peer10 if pd.notna(v)]
        if pd.isna(avg_short) or not peer10 or np.mean(peer10) == 0:
            continue
        scale = float(avg_short / np.mean(peer10))

        # Build yhat_peer on axis by averaging scaled peer OOS pooled preds
        for _, a in axisN.iterrows():
            fy_i, fp_i = int(a[fy]), int(a[fp])
            vals = []
            for pcol in ["peer1", "peer2", "peer3"]:
                if pcol not in peers.columns:
                    continue
                pid = str(r.get(pcol, "")).strip()
                if not pid or pid.lower() == "nan":
                    continue
                g = pooled_axis.loc[
                    (pooled_axis[store_col] == pid)
                    & (pooled_axis[fy] == fy_i)
                    & (pooled_axis[fp] == fp_i),
                    "yhat_pooled",
                ]
                if not g.empty and pd.notna(g.iloc[0]):
                    vals.append(float(g.iloc[0]))
            if vals:
                yhat_peer_rows.append(
                    {
                        store_col: st,
                        fy: fy_i,
                        fp: fp_i,
                        "yhat_peer": scale * float(np.mean(vals)),
                    }
                )

    if yhat_peer_rows:
        pred_peer = pd.DataFrame(yhat_peer_rows)

# ------------------------ Assemble eval_hist ------------------------
axis_all = axisN.copy()
axis_all["label"] = (
    axis_all[fy].astype(int).astype(str)
    + "-"
    + axis_all[fp].astype(int).astype(str)
)

# Base: all stores with at least 1 actual in the last-N periods
base = (
    h.merge(axis_all, on=[fy, fp], how="inner")[
        [store_col, fy, fp, "label", "actual"]
    ]
)
base[store_col] = base[store_col].astype(str).str.strip()
base = base[base[store_col].isin(stores_keep)]

eval_hist = (
    base.merge(pred_pooled, on=[store_col, fy, fp], how="left")
    .merge(pred_peer, on=[store_col, fy, fp], how="left")
    .merge(
        per_counts[[store_col, "n_hist_periods", "cohort"]],
        on=store_col,
        how="left",
    )
    .merge(n_eval, on=store_col, how="left")
    .sort_values([store_col, fy, fp])
    .reset_index(drop=True)
)

cols = [
    store_col,
    fy,
    fp,
    "label",
    "actual",
    "yhat_pooled",
    "yhat_peer",
    "cohort",
    "n_hist_periods",
    "n_eval_periods",
]
eval_hist = eval_hist[[c for c in cols if c in eval_hist.columns]]

# ------------------------ HARDEN: flatten to 1-D numeric ------------------------
def _flatten_col(series: pd.Series) -> pd.Series:
    def _one(v):
        # Unpack common nested cases
        if isinstance(v, pd.Series) and not v.empty:
            return v.iloc[0]
        if isinstance(v, pd.DataFrame) and not v.empty:
            return v.iloc[0, 0]
        if isinstance(v, (list, np.ndarray)) and len(v) > 0:
            return v[0]
        return v
    return pd.to_numeric(series.apply(_one), errors="coerce")

for col in ["actual", "yhat_pooled", "yhat_peer"]:
    if col in eval_hist.columns:
        eval_hist[col] = _flatten_col(eval_hist[col])

print("[OK] eval_hist:", eval_hist.shape)
print(eval_hist.dtypes)
display(eval_hist.head(5))


[OK] eval_hist: (300, 10)
store_number       object
fiscal_year         Int64
fiscal_period       Int64
label              object
actual            float64
yhat_pooled       float64
yhat_peer         float64
cohort             object
n_hist_periods      int64
n_eval_periods      int64
dtype: object


Unnamed: 0,store_number,fiscal_year,fiscal_period,label,actual,yhat_pooled,yhat_peer,cohort,n_hist_periods,n_eval_periods
0,364011,2025,5,2025-5,163710.0,150909.365745,,long,45,5
1,364011,2025,6,2025-6,175363.0,149341.820265,,long,45,5
2,364011,2025,7,2025-7,185265.0,157280.458212,,long,45,5
3,364011,2025,8,2025-8,181532.0,163681.517322,,long,45,5
4,364011,2025,9,2025-9,175932.0,159917.29706,,long,45,5


In [None]:
# Final 6-period forecasts per store using eval_hist + WAPE decision
from forecasting.runner_utils import (
    normalize_hist, normalize_fc_store, last6_axis_from_fc,
    build_peer_scaled_short, join_actuals, wape
)
from forecasting.model_selection import assign_store_models


fy, fp = "fiscal_year", "fiscal_period"
keys = ["store_number", fy, fp]
THRESHOLD_PP = 3.0  # pooled must beat peer by >= 3 pp to switch a short to pooled

assert "eval_hist" in globals() and not eval_hist.empty, "Build eval_hist first."
assert "c_heatmap" in globals() and "fc_store" in globals(), "Need c_heatmap and fc_store."

# ---------------------------------------------------------
# 1) Normalize and build next-6 pooled axis
# ---------------------------------------------------------
hist = normalize_hist(c_heatmap).copy()
fc = normalize_fc_store(fc_store).copy()
hist["store_number"] = hist["store_number"].astype(str).str.strip()
fc["store_number"] = fc["store_number"].astype(str).str.strip()

axis6 = last6_axis_from_fc(fc)
fc6 = fc.merge(axis6, on=[fy, fp], how="inner").copy()

# ---------------------------------------------------------
# 2) Peer forecasts on the same axis (next-6)
# ---------------------------------------------------------
peer_table = globals().get("peer_table_k3_short", globals().get("peer_table", pd.DataFrame()))

peer_fc_raw = build_peer_scaled_short(hist, None, fc, peer_table, axis6) \
    if (peer_table is not None and not peer_table.empty) \
    else pd.DataFrame(columns=keys + ["forecast"])

peer_fc_ren = (
    peer_fc_raw.rename(columns={"forecast":"peer_pred"})[keys + ["peer_pred"]]
    if (peer_fc_raw is not None and not peer_fc_raw.empty and ("forecast" in peer_fc_raw.columns))
    else pd.DataFrame(columns=keys + ["peer_pred"])
)

# ---------------------------------------------------------
# 3) Build per-store WAPEs from eval_hist and assign via decision rule
# ---------------------------------------------------------
eh = eval_hist.copy()
eh["store_number"] = eh["store_number"].astype(str).str.strip()

def _wape(y, yhat):
    y = pd.to_numeric(y, errors="coerce"); yhat = pd.to_numeric(yhat, errors="coerce")
    m = y.notna() & yhat.notna(); y, yhat = y[m], yhat[m]
    denom = y.abs().sum()
    return float("nan") if denom == 0 else float((y - yhat).abs().sum()/denom)

# Cohort per store (last fiscal row)
cohort_df = (
    eh.sort_values([fy, fp])
      .drop_duplicates(subset=["store_number"], keep="last")
      [["store_number","cohort"]]
      .reset_index(drop=True)
)

# WAPE per store (percent)
per_model = (
    eh.groupby("store_number")
      .apply(lambda g: _wape(g["actual"], g["yhat_pooled"]))
      .reset_index(name="WAPE_model_pct")
)
per_model["WAPE_model_pct"] = (per_model["WAPE_model_pct"] * 100).round(2)

per_peer = (
    eh.groupby("store_number")
      .apply(lambda g: _wape(g["actual"], g["yhat_peer"]))
      .reset_index(name="WAPE_peer_pct")
)
per_peer["WAPE_peer_pct"] = (per_peer["WAPE_peer_pct"] * 100).round(2)

combined_local = (
    cohort_df.merge(per_model, on="store_number", how="left")
             .merge(per_peer, on="store_number", how="left")
)

assigned = assign_store_models(
    combined_local,
    cohort_col="cohort",
    pooled_col="WAPE_model_pct",
    peer_col="WAPE_peer_pct",
    assignment_col="assigned_model",
    threshold_pp=THRESHOLD_PP,
)[["store_number","assigned_model"]]

print("Assigned:", assigned["assigned_model"].value_counts(dropna=False).to_dict())

# ---------------------------------------------------------
# 4) Apply assignment to next-6 forecasts
# ---------------------------------------------------------
final = (
    fc6.merge(peer_fc_ren, on=keys, how="left")
       .merge(assigned, on="store_number", how="left")
)

use_peer = (final["assigned_model"] == "peer") & final["peer_pred"].notna()
final["our_forecast"] = pd.to_numeric(
    np.where(use_peer, final["peer_pred"], final["forecast"]),
    errors="coerce"
)
final["source"] = np.where(use_peer, "peer", "model")

print("[OK] final 6-period forecasts:", final.shape)
print("By source:\n", final["source"].value_counts(dropna=False))

# ---------------------------------------------------------
# 5) Optional: print OOS WAPE from eval_hist (the basis of the decision)
# ---------------------------------------------------------
w_model_all = _wape(eh["actual"], eh["yhat_pooled"])
w_peer_all = _wape(eh["actual"], eh["yhat_peer"])

short_mask = eh["cohort"].astype(str).str.lower().eq("short")
long_mask  = eh["cohort"].astype(str).str.lower().eq("long")

w_model_short = _wape(eh.loc[short_mask,"actual"], eh.loc[short_mask,"yhat_pooled"])
w_peer_short  = _wape(eh.loc[short_mask,"actual"], eh.loc[short_mask,"yhat_peer"])
w_model_long  = _wape(eh.loc[long_mask, "actual"], eh.loc[long_mask, "yhat_pooled"])

print("OOS WAPE (eval_hist) — pooled:", round(w_model_all, 4),
      "| peer:", "N/A" if np.isnan(w_peer_all) else round(w_peer_all, 4))
print("OOS WAPE (short) — pooled:", round(w_model_short, 4),
      "| peer:", "N/A" if np.isnan(w_peer_short) else round(w_peer_short, 4))
print("OOS WAPE (long) — pooled:", round(w_model_long, 4))

# ---------------------------------------------------------
# 6) Output
# ---------------------------------------------------------
final_forecasts_6 = final[["store_number", fy, fp, "our_forecast", "source"]] \
    .sort_values(["store_number", fy, fp]).copy()

display(final_forecasts_6.head(20))


Assigned: {'pooled': 43, 'peer': 17}
[OK] final 6-period forecasts: (360, 9)
By source:
 source
model    258
peer     102
Name: count, dtype: int64
OOS WAPE (eval_hist) — pooled: 0.056 | peer: 0.1104
OOS WAPE (short) — pooled: 0.0567 | peer: 0.1104
OOS WAPE (long) — pooled: 0.0554


  .apply(lambda g: _wape(g["actual"], g["yhat_pooled"]))
  .apply(lambda g: _wape(g["actual"], g["yhat_peer"]))


Unnamed: 0,store_number,fiscal_year,fiscal_period,our_forecast,source
0,364011,2025,12,164077.325104,model
60,364011,2025,13,155850.849829,model
120,364011,2026,1,150530.718319,model
180,364011,2026,2,147233.78454,model
240,364011,2026,3,145020.272917,model
300,364011,2026,4,143030.315268,model
1,364012,2025,12,65159.364478,model
61,364012,2025,13,65291.611107,model
121,364012,2026,1,66712.238541,model
181,364012,2026,2,68851.283092,model


# Total Forecasts by STate 

In [None]:
# Forecasts by State (with store count) + Totals by fiscal period (using the assigned model per store)
import pandas as pd
fy, fp = "fiscal_year", "fiscal_period"

# 0) Pick the correct forecast table and standardize columns
if "final_forecasts_6" in globals():
    fc = final_forecasts_6.copy()  # cols: store_number, fy, fp, our_forecast, source
elif "final" in globals():
    fc = final.copy()               # cols: store_number, fy, fp, our_forecast, source (and more)
elif "final_fc" in globals():
    fc = final_fc.copy()            # already has forecast, source (from earlier helpers)
else:
    raise AssertionError("No forecast table found. Expected one of: final_forecasts_6, final, final_fc")

# Normalize names to use 'forecast' consistently
if "our_forecast" in fc.columns and "forecast" not in fc.columns:
    fc = fc.rename(columns={"our_forecast": "forecast"})
if "source" not in fc.columns:
    fc["source"] = "model"  # fallback if missing (all pooled)

# 1) Normalize keys and forecast type
fc["store_number"] = fc["store_number"].astype(str).str.strip()
fc[fy] = pd.to_numeric(fc[fy], errors="coerce").astype("Int64")
fc[fp] = pd.to_numeric(fc[fp], errors="coerce").astype("Int64")
fc["forecast"] = pd.to_numeric(fc["forecast"], errors="coerce")

# 2) Build store -> State map (prefer geo_stores; else infer from c_heatmap if it has a state-like column)
state_map = None
if "geo_stores" in globals() and {"Heatmap_Store_Number","State"} <= set(geo_stores.columns):
    g = geo_stores[["Heatmap_Store_Number","State"]].copy()
    g["Heatmap_Store_Number"] = g["Heatmap_Store_Number"].astype(str).str.strip()
    state_map = g.rename(columns={"Heatmap_Store_Number":"store_number"})
elif "c_heatmap" in globals() and any("state" in c.lower() for c in c_heatmap.columns):
    ch = c_heatmap.copy()
    ch["store_number"] = ch["store_number"].astype(str).str.strip()
    st_col = [c for c in ch.columns if "state" in c.lower()][0]
    ch[st_col] = ch[st_col].astype(str).str.strip()
    state_map = (ch.dropna(subset=[st_col])
                   .groupby("store_number", as_index=False)[st_col]
                   .agg(lambda s: s.mode().iat[0] if not s.mode().empty else s.dropna().iloc[0]))
    state_map = state_map.rename(columns={st_col:"State"})

# 3) Attach State
if state_map is not None:
    fc_state = fc.merge(state_map, on="store_number", how="left")
    fc_state["State"] = fc_state["State"].fillna("Unknown")
else:
    fc_state = fc.assign(State="Unknown")

# 4) Forecasts by state with contributing store count (using chosen model per row)
by_state = (fc_state.groupby(["State", fy, fp], as_index=False)
            .agg(forecast=("forecast","sum"),
                 stores=("store_number","nunique"))
            .sort_values(["State", fy, fp]))

print("Forecasts by state (top 15 by forecast):")
with pd.option_context("display.float_format", "{:,.0f}".format):
    display(by_state.sort_values("forecast", ascending=False).head(15))

# 4b)show how many rows used peer vs model per state (sanity check on assignment usage)
by_state_source = (fc_state.groupby(["State", "source"], as_index=False)
                   .agg(rows=("forecast","size"),
                        stores=("store_number","nunique"))
                   .sort_values(["State","source"]))
print("Rows by source per state (peer vs model):")
display(by_state_source)

# 5) Totals by fiscal period (all states combined), with contributing store count
by_period = (fc_state.groupby([fy, fp], as_index=False)
             .agg(forecast=("forecast","sum"),
                  stores=("store_number","nunique"))
             .sort_values([fy, fp]))
print("Total forecasts by period:")
with pd.option_context("display.float_format", "{:,.0f}".format):
    display(by_period)

# 6) Grand totals and unique stores
grand_total = by_period["forecast"].sum()
unique_store_count = fc_state["store_number"].nunique()
print(f"Grand total forecast (all periods): {grand_total:,.0f}")
print(f"Unique stores contributing overall: {unique_store_count:,}")


Forecasts by state (top 15 by forecast):


Unnamed: 0,State,fiscal_year,fiscal_period,forecast,stores
29,Michigan,2026,4,2688497,23
28,Michigan,2026,3,2635246,23
27,Michigan,2026,2,2563369,23
26,Michigan,2026,1,2486940,23
25,Michigan,2025,13,2420502,23
24,Michigan,2025,12,2375598,23
35,New York,2026,4,1262880,13
34,New York,2026,3,1237741,13
33,New York,2026,2,1203300,13
32,New York,2026,1,1167692,13


Rows by source per state (peer vs model):


Unnamed: 0,State,source,rows,stores
0,Alabama,model,18,3
1,Alabama,peer,6,1
2,Indiana,model,12,2
3,Indiana,peer,42,7
4,Kansas,model,42,7
5,Kentucky,model,6,1
6,Kentucky,peer,18,3
7,Michigan,model,114,19
8,Michigan,peer,24,4
9,New York,model,66,11


Total forecasts by period:


Unnamed: 0,fiscal_year,fiscal_period,forecast,stores
0,2025,12,5390945,60
1,2025,13,5508761,60
2,2026,1,5682218,60
3,2026,2,5881409,60
4,2026,3,6068625,60
5,2026,4,6207369,60


Grand total forecast (all periods): 34,739,326
Unique stores contributing overall: 60


# Budget Gap

In [None]:
# === Budget gap for ALL forecasts (using assigned model per row) ===
import re, numpy as np, pandas as pd

fy, fp = "fiscal_year", "fiscal_period"

def dkey(x):
    s = "" if pd.isna(x) else str(x).strip().replace(".0","")
    return re.sub(r"\D","", s)

# 0) Inputs: budget + one of final_forecasts_6, final, or final_fc
assert "budget" in globals(), "budget DataFrame not found."
if "final_forecasts_6" in globals():
    pred_src = final_forecasts_6.copy()   # cols: store_number, fy, fp, our_forecast, source
elif "final" in globals():
    pred_src = final.copy()               # cols: store_number, fy, fp, our_forecast, source (and more)
elif "final_fc" in globals():
    pred_src = final_fc.copy()            # already has forecast; may or may not have source
else:
    raise AssertionError("No forecast table found. Expected one of: final_forecasts_6, final, final_fc")

# 1) Standardize forecast table and keys
if "our_forecast" in pred_src.columns and "forecast" not in pred_src.columns:
    pred_src = pred_src.rename(columns={"our_forecast": "forecast"})
if "source" not in pred_src.columns:
    pred_src["source"] = "model"

pb_fc = pred_src[["store_number", fy, fp, "forecast", "source"]].copy()
pb_fc["store_number"]  = pb_fc["store_number"].astype(str).str.strip()
pb_fc["store_key"]     = pb_fc["store_number"].apply(dkey)
pb_fc[fy]              = pd.to_numeric(pb_fc[fy], errors="coerce").astype("Int64")
pb_fc[fp]              = pd.to_numeric(pb_fc[fp], errors="coerce").astype("Int64")
pb_fc["forecast"]      = pd.to_numeric(pb_fc["forecast"], errors="coerce")

# 2) Normalize budget columns
b = budget.copy()
if "year" in b.columns and fy not in b.columns:
    b = b.rename(columns={"year": fy})
if "period_number" in b.columns and fp not in b.columns:
    b = b.rename(columns={"period_number": fp})
if "budget_total_net_sales" not in b.columns and "total_net_sales" in b.columns:
    b = b.rename(columns={"total_net_sales": "budget_total_net_sales"})

bud_key_col = None
for c in ("store_number","Heatmap_Store_Number","heatmap_storenumber","Store_Number","store","storeid"):
    if c in b.columns:
        bud_key_col = c; break
if bud_key_col is None:
    raise KeyError("Could not find a store id column in budget.")

b = b.rename(columns={bud_key_col:"store_raw"}).copy()
b["store_raw"] = b["store_raw"].astype(str).str.strip()
b["store_key"] = b["store_raw"].apply(dkey)
for c in (fy, fp, "budget_total_net_sales"):
    if c in b.columns:
        b[c] = pd.to_numeric(b[c], errors="coerce")
b = b[["store_key", fy, fp, "budget_total_net_sales"]].dropna(subset=[fy, fp])

# 3) Merge and compute gaps
planning_base = (
    pb_fc.merge(b, on=["store_key", fy, fp], how="left")
         .drop(columns=["store_key"])
         .rename(columns={"forecast": "forecast_baseline"})
)
planning_base["forecast_baseline"]      = pd.to_numeric(planning_base["forecast_baseline"], errors="coerce")
planning_base["budget_total_net_sales"] = pd.to_numeric(planning_base["budget_total_net_sales"], errors="coerce")

planning_base["gap_value"] = planning_base["forecast_baseline"] - planning_base["budget_total_net_sales"]
planning_base["gap_percent"] = np.where(
    planning_base["budget_total_net_sales"] > 0,
    planning_base["gap_value"] / planning_base["budget_total_net_sales"],
    np.nan
)

# 4) Display compact preview
def fmt_int(x):  return "" if pd.isna(x) else f"{int(round(float(x))):,}"
def fmt_pct(x):
    if pd.isna(x): return ""
    v = float(x) * 100.0
    v = 0.0 if abs(v) < 0.05 else v
    return f"{v:.1f}%"

cols = ["source","store_number", fy, fp, "forecast_baseline", "budget_total_net_sales", "gap_value", "gap_percent"]
out = planning_base[cols].sort_values([fy, fp, "store_number"]).head(15).copy()
for c in ["forecast_baseline","budget_total_net_sales","gap_value"]:
    out[c] = out[c].map(fmt_int)
out["gap_percent"] = out["gap_percent"].map(fmt_pct)

print("Budget gap (top 15):")
display(out)

# 5) Totals by period
by_period = (planning_base.groupby([fy, fp], as_index=False)
             .agg(total_forecast=("forecast_baseline","sum"),
                  total_budget =("budget_total_net_sales","sum"),
                  gap_value    =("gap_value","sum"),
                  stores       =("store_number","nunique")))
print("Totals by period:")
with pd.option_context("display.float_format", "{:,.0f}".format):
    display(by_period.sort_values([fy, fp]))


Budget gap (top 15):


Unnamed: 0,source,store_number,fiscal_year,fiscal_period,forecast_baseline,budget_total_net_sales,gap_value,gap_percent
0,model,364011,2025,12,164077,161104,2973,1.8%
6,model,364012,2025,12,65159,63470,1689,2.7%
12,model,364013,2025,12,99613,90075,9538,10.6%
18,model,364014,2025,12,103646,85251,18395,21.6%
24,model,364015,2025,12,45083,45514,-431,-0.9%
30,model,364016,2025,12,34841,31623,3218,10.2%
36,model,364017,2025,12,105414,104521,893,0.9%
42,model,364018,2025,12,82269,86372,-4103,-4.8%
48,model,364019,2025,12,46895,52864,-5969,-11.3%
54,model,364020,2025,12,78251,80619,-2368,-2.9%


Totals by period:


Unnamed: 0,fiscal_year,fiscal_period,total_forecast,total_budget,gap_value,stores
0,2025,12,5390945,5653459,-262513,60
1,2025,13,5508761,5546126,-37365,60
2,2026,1,5682218,0,0,60
3,2026,2,5881409,0,0,60
4,2026,3,6068625,0,0,60
5,2026,4,6207369,0,0,60


# Labor Predictions

In [None]:
# Cell A — Labor COST 
import importlib, forecasting.labor as fl
fl = importlib.reload(fl)
build_labor_cost_forecast = fl.build_labor_cost_forecast

# Pick your session objects
final_fc_df = (
    final_forecasts_6 if 'final_forecasts_6' in globals()
    else final if 'final' in globals()
    else final_fc
)

store_state_src = (
    geo_stores if 'geo_stores' in globals()
    else (c_heatmap if 'c_heatmap' in globals() else None)
)

# Choose a cost source with both cost + sales history
if 'heatmap' in globals() and {'total_labor_cost','total_net_sales'} <= set(heatmap.columns):
    cost_source = heatmap
else:
    cost_source = labor

labor_cost_fc = build_labor_cost_forecast(
    final_fc=final_fc_df,
    store_state_source=store_state_src,
    cost_source=cost_source,
    ratio_df=None,
    allowed_store_numbers=None,
    excluded_store_numbers=None,
)

print(labor_cost_fc.shape)
print(labor_cost_fc.head(10))


(360, 8)
  source store_number  fiscal_year  fiscal_period   State  sales_forecast  \
0  model       364011         2025             12  Kansas   164077.325104   
1  model       364011         2025             13  Kansas   155850.849829   
2  model       364011         2026              1  Kansas   150530.718319   
3  model       364011         2026              2  Kansas   147233.784540   
4  model       364011         2026              3  Kansas   145020.272917   
5  model       364011         2026              4  Kansas   143030.315268   
6  model       364012         2025             12  Kansas    65159.364478   
7  model       364012         2025             13  Kansas    65291.611107   
8  model       364012         2026              1  Kansas    66712.238541   
9  model       364012         2026              2  Kansas    68851.283092   

   labor_cost_ratio  pred_total_labor_cost  
0          0.181124           29718.361706  
1          0.181124           28228.348582  
2       

In [None]:

from forecasting.labor import build_labor_forecasts


def _pick(*names):
    for n in names:
        if n in globals():
            return globals()[n]
    raise NameError(f"None of {names} found in this session.")

geo_stores_df = _pick("geo_stores")
labor_df = _pick("labor")
calendar_df = _pick("cal2", "cal")
sales_hist_df = _pick("feats", "heatmap")
final_fc_df = _pick("final_forecasts_6", "final", "final_fc")

# Optional filters (set to None for all stores)
allowed_hm_keys = None
excluded_hm_keys = None

# Build forecasts
fc_hours, fc_dept = build_labor_forecasts(
    geo_stores=geo_stores_df,
    labor=labor_df,
    calendar=calendar_df,
    sales_history=sales_hist_df,
    final_fc=final_fc_df,
    allowed_hm_keys=allowed_hm_keys,
    excluded_hm_keys=excluded_hm_keys,
)

# Quick previews
print("fc_hours:", fc_hours.shape)
print(fc_hours.head(10))

print("fc_dept:", fc_dept.shape)
print(fc_dept.head(10))


  .apply(lambda g: g.assign(share_final=_norm(g["share"].fillna(g["share_chain"])) .values))


fc_hours: (360, 5)
   hm_key  fiscal_year  fiscal_period  sales_forecast  hours_forecast
0  364011         2025             12   164077.325104     2001.623072
1  364011         2025             13   155850.849829     1901.266105
2  364011         2026              1   150530.718319     1836.364401
3  364011         2026              2   147233.784540     1796.144226
4  364011         2026              3   145020.272917     1769.141007
5  364011         2026              4   143030.315268     1744.864983
6  364012         2025             12    65159.364478     1124.835593
7  364012         2025             13    65291.611107     1127.118546
8  364012         2026              1    66712.238541     1151.642608
9  364012         2026              2    68851.283092     1188.568589
fc_dept: (720, 5)
   hm_key  fiscal_year  fiscal_period  department  hours_forecast_dept
0  364011         2025             12  Management           532.314886
1  364011         2025             12       Staff  

In [None]:

fy, fp = "fiscal_year", "fiscal_period"

by_period = (
    fc_hours.groupby([fy, fp], as_index=False)
    .agg(
        total_sales=("sales_forecast", "sum"),
        total_hours=("hours_forecast", "sum")
    )
)

print("Totals by period:")
print(by_period)

# Optional saves
fc_hours.to_csv("labor_hours_store.csv", index=False)
fc_dept.to_csv("labor_hours_store_dept.csv", index=False)


Totals by period:
   fiscal_year  fiscal_period   total_sales   total_hours
0         2025             12  5.390945e+06  83219.087186
1         2025             13  5.508761e+06  85274.091247
2         2026              1  5.682218e+06  88192.652755
3         2026              2  5.881409e+06  91499.882902
4         2026              3  6.068625e+06  94594.356303
5         2026              4  6.207369e+06  96893.148497
