In [2]:
import os
import numpy as np
import pandas as pd

# -------------------------
# Config
# -------------------------
SEED = 42
rng = np.random.default_rng(SEED)  # single seeded generator for the run

# When the working directory is the "notebooks" folder the project root is its
# parent; otherwise the working directory itself is used as the root.
_cwd = os.getcwd()
if os.path.basename(_cwd) == "notebooks":
    PROJECT_ROOT = os.path.abspath(os.path.join(_cwd, ".."))
else:
    PROJECT_ROOT = _cwd

RAW_DIR = os.path.join(PROJECT_ROOT, "data", "raw")
os.makedirs(RAW_DIR, exist_ok=True)  # create data/raw if it does not exist yet

# Inventory calendar: one snapshot per day, both end dates included
# (2024 is a leap year, so this yields 366 days).
INV_START = pd.Timestamp("2024-01-01")
INV_END = pd.Timestamp("2024-12-31")
inv_dates = pd.date_range(start=INV_START, end=INV_END, freq="D")

# -------------------------
# Load inputs
# -------------------------
# The sales fact table needs its sale_date column parsed as datetimes; the
# three dimension tables are read with pandas' default settings.
parts_sales_path = os.path.join(RAW_DIR, "fact_parts_sales.csv")
fact_parts = pd.read_csv(parts_sales_path, parse_dates=["sale_date"])

dim_part, dim_warehouse, dim_dealer = (
    pd.read_csv(os.path.join(RAW_DIR, csv_name))
    for csv_name in ("dim_part.csv", "dim_warehouse.csv", "dim_dealer.csv")
)

# Quick size summary of what was loaded.
print("Parts lines:", len(fact_parts))
print("Parts:", len(dim_part), "Warehouses:", len(dim_warehouse))
print("Inventory days:", len(inv_dates))

# -------------------------
# Map parts sales -> region -> warehouse
# -------------------------
# Attach each dealer's region to its sales lines. validate="many_to_one" makes
# pandas raise a MergeError if dealer_id is duplicated in dim_dealer; without
# the check a duplicate would silently fan out sales rows and inflate demand.
fact_parts = fact_parts.merge(
    dim_dealer[["dealer_id", "region_id"]],
    on="dealer_id",
    how="left",
    suffixes=("", "_dealer"),
    validate="many_to_one",
)

# The "_dealer"-suffixed column only exists when fact_parts already carried a
# region_id of its own. In that case keep the sales line's own region_id and
# fall back to the dealer-derived value only where it is missing.
if "region_id_dealer" in fact_parts.columns:
    fact_parts["region_id"] = fact_parts["region_id"].fillna(fact_parts["region_id_dealer"])
    fact_parts.drop(columns=["region_id_dealer"], inplace=True)

assert fact_parts["region_id"].isna().mean() == 0.0, "Some rows missing region_id after dealer merge."

# Serving warehouse per sales region. R05 and R06 are routed to another
# region's warehouse as a proxy.
region_to_warehouse = {
    "R01": "W02",  # North -> Delhi
    "R02": "W01",  # South -> Bangalore
    "R03": "W03",  # West  -> Mumbai
    "R04": "W04",  # East  -> Kolkata
    "R05": "W03",  # Central -> Mumbai (proxy)
    "R06": "W04",  # Northeast -> Kolkata (proxy)
}

# A region_id absent from the mapping becomes NaN and trips the assert below.
fact_parts["warehouse_id"] = fact_parts["region_id"].map(region_to_warehouse)
assert fact_parts["warehouse_id"].isna().mean() == 0.0, "Some rows missing warehouse_id after region mapping."

# -------------------------
# Daily demand from recorded sales (within inventory year)
# -------------------------
# Keep only sales dated inside the inventory window (both ends inclusive).
in_window = fact_parts["sale_date"].between(INV_START, INV_END)
demand = fact_parts[in_window].copy()

# Total quantity sold per warehouse, part and sale date. Only combinations
# that actually have sales produce a row.
qty_per_lane_day = demand.groupby(["warehouse_id", "part_id", "sale_date"])["quantity"].sum()
daily_demand = qty_per_lane_day.reset_index().rename(
    columns={"sale_date": "date", "quantity": "demand_qty"}
)

print("Daily demand records:", len(daily_demand))

# Demand lookup dict: (warehouse, part) -> series(date->qty)
demand_dict = {}
for lane_key, lane_rows in daily_demand.groupby(["warehouse_id", "part_id"]):
    demand_dict[lane_key] = lane_rows.set_index("date")["demand_qty"]

# -------------------------
# Build ALL lanes (one per warehouse x part; 4 x 200 = 800 with the current
# dimension tables) and compute avg_daily_demand + reorder_point
# -------------------------
warehouses = dim_warehouse["warehouse_id"].tolist()
parts = dim_part["part_id"].tolist()

# Cartesian product so that lanes without any recorded sales still exist.
lane_df = pd.MultiIndex.from_product([warehouses, parts], names=["warehouse_id", "part_id"]).to_frame(index=False)

# Mean of the per-day totals in daily_demand for each lane.
# NOTE(review): daily_demand only has rows for days with sales, so this is the
# average over sale-days, not over the whole calendar - confirm that is the
# intended definition of "average daily demand".
avg_demand_obs = (
    daily_demand.groupby(["warehouse_id", "part_id"])["demand_qty"]
    .mean()
    .reset_index()
    .rename(columns={"demand_qty": "avg_daily_demand"})
)

# Lanes with no recorded sales get 0.0 before the floor is applied.
lane_df = lane_df.merge(avg_demand_obs, on=["warehouse_id", "part_id"], how="left")
lane_df["avg_daily_demand"] = lane_df["avg_daily_demand"].fillna(0.0)

lane_df = lane_df.merge(
    dim_part[["part_id", "lead_time_days", "criticality"]],
    on="part_id",
    how="left"
)

# Baseline demand floors (so lanes are not "dead"). Any criticality value not
# listed here falls back to 0.06.
baseline_floor = {"Low": 0.18, "Medium": 0.30, "High": 0.50}
demand_floor = lane_df["criticality"].map(baseline_floor).fillna(0.06)
lane_df["avg_daily_demand"] = np.maximum(lane_df["avg_daily_demand"], demand_floor)

# Reorder point formula: expected demand over lead time plus safety days,
# rounded up to a whole unit.
safety_days = 7
lane_df["reorder_point"] = np.ceil(lane_df["avg_daily_demand"] * (lane_df["lead_time_days"] + safety_days)).astype(int)

# Minimum reorder points by criticality (2 for any unlisted criticality).
min_rp = {"Low": 2, "Medium": 4, "High": 8}
rp_floor = lane_df["criticality"].map(min_rp).fillna(2).astype(int)
lane_df["reorder_point"] = np.maximum(lane_df["reorder_point"], rp_floor)

print("✅ Lanes:", len(lane_df))
print("Min avg_daily_demand by criticality:")
print(lane_df.groupby("criticality")["avg_daily_demand"].min())

# -------------------------
# Seasonality multipliers (simple)
# -------------------------
# Month -> demand multiplier: start every month at 1.00, then override the
# peak months and the monsoon-slowdown months.
month_multiplier = {month: 1.00 for month in range(1, 13)}
month_multiplier.update({month: 1.15 for month in (3, 4, 5, 6, 10, 11)})  # peak
month_multiplier.update({month: 0.90 for month in (7, 8, 9)})             # monsoon slowdown

# One entry per inventory date, keyed by the date itself.
seasonal_multiplier = {day: month_multiplier[day.month] for day in inv_dates}

# -------------------------
# Simulate inventory day-by-day for each lane
# -------------------------
# One output tuple per lane per day:
# (date, warehouse, part, on_hand, reorder_point, stockout_flag).
#
# The order of rng calls below is kept fixed on purpose: with the seeded
# generator, reordering any draw would change the generated dataset.
inventory_rows = []

# Per-criticality parameters, defined once instead of on every iteration.
start_mult = {"Low": 0.95, "Medium": 0.85, "High": 0.75}   # starting stock as a share of RP (tight)
spike_prob = {"Low": 0.015, "Medium": 0.025}               # any other criticality -> 0.040
supplier_delay_max = {"Low": 6, "Medium": 8}               # any other criticality -> 10

for _, row in lane_df.iterrows():
    w = row["warehouse_id"]
    p = row["part_id"]
    lt = int(row["lead_time_days"])
    rp = int(row["reorder_point"])
    crit = row["criticality"]

    # Lane-level values that do not change from day to day.
    baseline_rate = float(row["avg_daily_demand"])  # Poisson rate of the baseline demand
    spike_p = spike_prob.get(crit, 0.040)
    # Supplier delay variability (more for higher criticality)
    delay_max = supplier_delay_max.get(crit, 10)
    # MOQ by criticality
    moq = {"Low": rp, "Medium": int(1.2 * rp), "High": int(1.5 * rp)}.get(crit, rp)

    # Starting stock (tight): a share of the reorder point plus a small random
    # offset in [-3, 3], never below zero.
    on_hand = int(max(0, np.ceil(rp * start_mult.get(crit, 0.9) + rng.integers(-3, 4))))

    pending = []  # list of (arrival_date, qty)
    series = demand_dict.get((w, p))  # recorded daily sales for this lane, or None

    for d in inv_dates:
        # Receive orders arriving today (with occasional under-fill)
        if pending:
            arriving_qtys = [qty for (ad, qty) in pending if ad == d]
            for qty in arriving_qtys:
                # 18% chance the order is under-filled to 55-85% of its quantity
                if rng.random() < 0.18:
                    qty = int(np.ceil(qty * rng.uniform(0.55, 0.85)))
                on_hand += int(qty)

            pending = [(ad, qty) for (ad, qty) in pending if ad != d]

        # Demand today = (recorded sales + baseline Poisson draw) * seasonality
        sales_dem = int(series.get(d, 0)) if series is not None else 0
        baseline_dem = int(rng.poisson(lam=baseline_rate))
        base_dem = sales_dem + baseline_dem
        dem = int(np.round(base_dem * seasonal_multiplier[d]))

        # Random spike events: add a further 100-250% of today's demand.
        # The random draw only happens on days with non-zero demand.
        if dem > 0 and rng.random() < spike_p:
            dem += int(np.ceil(dem * rng.uniform(1.0, 2.5)))

        on_hand -= dem

        # Stockout flag: set when the day ends with nothing on hand, which
        # includes the case where demand exactly exhausted the stock.
        stockout_flag = 1 if on_hand <= 0 else 0
        if on_hand < 0:
            on_hand = 0  # unmet demand is dropped, not carried as a backorder

        # Reorder later than RP sometimes (planner delay): today's trigger is
        # a random 35-75% of the reorder point.
        trigger = int(np.floor(rp * rng.uniform(0.35, 0.75)))

        # NOTE(review): this check looks only at on_hand, not at quantities
        # already in `pending`, so a new order is placed on every day the
        # stock sits at or below the trigger - confirm this is intended.
        if on_hand <= trigger:
            # Under-order (forecast/budget constraints)
            under_order = rng.uniform(0.35, 0.75)

            # Tight target level
            target_level = int(np.ceil(rp * rng.uniform(1.05, 1.35)))
            need = max(target_level - on_hand, 0)

            # Order the under-ordered need, but never less than the MOQ.
            order_qty = int(max(np.ceil(need * under_order), moq))

            # Arrival = lead time plus a random supplier delay of 0..delay_max days.
            delay = int(rng.integers(0, delay_max + 1))

            arrival = d + pd.Timedelta(days=lt + delay)
            pending.append((arrival, order_qty))

        inventory_rows.append((d, w, p, on_hand, rp, stockout_flag))

# Assemble the simulated snapshots into the daily inventory fact table.
fact_inventory_daily = pd.DataFrame(
    inventory_rows,
    columns=["snapshot_date", "warehouse_id", "part_id", "on_hand_qty", "reorder_point", "stockout_flag"]
)

print("\nRows:", len(fact_inventory_daily))
print("Stockout rate:", fact_inventory_daily["stockout_flag"].mean())
print("Stockout counts:\n", fact_inventory_daily["stockout_flag"].value_counts())

# Expect exactly one row per (warehouse, part, day). The count is derived from
# the dimension tables and the inventory calendar rather than hard-coded, so
# it still catches duplicated lanes but keeps working when the dimensions
# change size. With the current inputs this is 4 * 200 * 366
# (2024 is leap year => 366 days).
expected_rows = (
    dim_warehouse["warehouse_id"].nunique()
    * dim_part["part_id"].nunique()
    * len(inv_dates)
)
assert len(fact_inventory_daily) == expected_rows, (
    f"Expected {expected_rows} inventory rows, got {len(fact_inventory_daily)}"
)
print("✅ Inventory daily size correct")

# -------------------------
# Save
# -------------------------
# Write the fact table next to the other raw CSVs, without the index column.
out_path = os.path.join(RAW_DIR, "fact_inventory_daily.csv")
fact_inventory_daily.to_csv(out_path, index=False)
print("✅ Saved:", out_path)

# Share of stockout-flagged snapshots per warehouse, worst first.
print("\nStockout rate by warehouse:")
stockout_by_warehouse = fact_inventory_daily.groupby("warehouse_id")["stockout_flag"].mean()
print(stockout_by_warehouse.sort_values(ascending=False))

Parts lines: 22529
Parts: 200 Warehouses: 4
Inventory days: 366
Daily demand records: 9238
✅ Lanes: 800
Min avg_daily_demand by criticality:
criticality
High      0.50
Low       0.18
Medium    0.30
Name: avg_daily_demand, dtype: float64

Rows: 292800
Stockout rate: 0.023364071038251367
Stockout counts:
 stockout_flag
0    285959
1      6841
Name: count, dtype: int64
✅ Inventory daily size correct
✅ Saved: j:\PROJECTS\volvo-aftersales-analytics\data\raw\fact_inventory_daily.csv

Stockout rate by warehouse:
warehouse_id
W01    0.024645
W04    0.023429
W03    0.022896
W02    0.022486
Name: stockout_flag, dtype: float64
