# Phase 1: Baseline Inventory Decision System

Objective:
Build a transparent, rule-based weekly inventory allocation system
for Walmart Store × Department units.

This phase establishes:
- A deterministic decision policy
- Logged decision outputs
- Evaluation metrics

No machine learning is used in this phase.


### Imports & Global Config

In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from tqdm import tqdm
import warnings
warnings.filterwarnings("ignore")

# Reproducibility
np.random.seed(42)

# Display
pd.set_option("display.max_columns", 100)
pd.set_option("display.float_format", "{:.3f}".format)


### Load Raw Data

In [2]:
DATA_PATH = "../data/"

train = pd.read_csv(DATA_PATH + "train.csv")
features = pd.read_csv(DATA_PATH + "features.csv")
stores = pd.read_csv(DATA_PATH + "stores.csv")

print("Train:", train.shape)
print("Features:", features.shape)
print("Stores:", stores.shape)


Train: (421570, 5)
Features: (8190, 12)
Stores: (45, 3)


### Parse Dates

In [3]:
for df in [train, features]:
    df["Date"] = pd.to_datetime(train["Date"], dayfirst=True)



### Merge Datasets (Single Decision Table)

In [4]:
# Merge train with features
df = train.merge(
    features,
    on=["Store", "Date", "IsHoliday"],
    how="left"
)

# Merge store metadata
df = df.merge(
    stores,
    on="Store",
    how="left"
)

print("Merged dataset shape:", df.shape)
df.head()


Merged dataset shape: (518521, 16)


Unnamed: 0,Store,Dept,Date,Weekly_Sales,IsHoliday,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,MarkDown4,MarkDown5,CPI,Unemployment,Type,Size
0,1,1,2010-02-05,24924.5,False,42.31,2.572,,,,,,211.096,8.106,A,151315
1,1,1,2010-02-05,24924.5,False,55.32,3.386,6766.44,5147.7,50.82,3639.9,2737.42,223.463,6.573,A,151315
2,1,1,2010-02-12,46039.49,True,38.51,2.548,,,,,,211.242,8.106,A,151315
3,1,1,2010-02-19,41595.55,False,39.93,2.514,,,,,,211.289,8.106,A,151315
4,1,1,2010-02-19,41595.55,False,52.92,3.252,9696.28,292.1,103.78,1133.15,6612.69,223.513,6.573,A,151315


### Sort for Temporal Logic

In [5]:
df = df.sort_values(
    by=["Store", "Dept", "Date"]
).reset_index(drop=True)


### Sanity Checks

In [6]:
# Missing values summary
df.isna().mean().sort_values(ascending=False).head(10)


MarkDown2      0.673
MarkDown4      0.604
MarkDown3      0.596
MarkDown1      0.556
MarkDown5      0.554
CPI            0.160
Unemployment   0.160
Temperature    0.091
Fuel_Price     0.091
IsHoliday      0.000
dtype: float64

### Global Parameters

In [7]:
ROLLING_WINDOW = 12   # weeks
MIN_HISTORY = 6       # minimum weeks required


### Rolling Demand Statistics

In [8]:
df["rolling_mean"] = (
    df.groupby(["Store", "Dept"])["Weekly_Sales"]
      .transform(lambda x: x.rolling(ROLLING_WINDOW, min_periods=MIN_HISTORY).mean())
)

df["rolling_std"] = (
    df.groupby(["Store", "Dept"])["Weekly_Sales"]
      .transform(lambda x: x.rolling(ROLLING_WINDOW, min_periods=MIN_HISTORY).std())
)


### Fallback Global Department Mean

In [9]:
global_dept_mean = (
    df.groupby("Dept")["Weekly_Sales"]
      .mean()
)

global_dept_mean.head()


Dept
1   19210.141
2   43586.047
3   11807.359
4   25950.053
5   21275.389
Name: Weekly_Sales, dtype: float64

### Holiday Demand Amplification (Store-Level)

In [10]:
holiday_stats = (
    df.groupby(["Store", "IsHoliday"])["Weekly_Sales"]
      .mean()
      .unstack()
)

holiday_stats["holiday_amp"] = (
    holiday_stats[True] / holiday_stats[False]
)

holiday_stats["holiday_amp"].describe()


count   45.000
mean     1.065
std      0.047
min      0.934
25%      1.041
50%      1.068
75%      1.082
max      1.160
Name: holiday_amp, dtype: float64

### Attach Holiday Amplification

In [11]:
df = df.merge(
    holiday_stats["holiday_amp"],
    on="Store",
    how="left"
)

# Safety clamp (avoid extreme ratios)
df["holiday_amp"] = df["holiday_amp"].clip(0.8, 1.5)


### Expected Demand Estimation

In [12]:
def estimate_expected_demand(row):
    if not np.isnan(row["rolling_mean"]):
        base = row["rolling_mean"]
    else:
        base = global_dept_mean.loc[row["Dept"]]
    
    if row["IsHoliday"]:
        return base * row["holiday_amp"]
    else:
        return base

df["expected_demand"] = df.apply(
    estimate_expected_demand,
    axis=1
)


### Capacity Assumption (Simple but Explicit)
We assume each Store–Dept can stock up to 120% of its rolling mean.

In [13]:
df["capacity"] = (
    1.2 * df["rolling_mean"]
)

# Fallback if rolling_mean is missing
df["capacity"] = df["capacity"].fillna(
    1.2 * df["expected_demand"]
)


### Inventory Allocation Decision

In [14]:
df["allocated_inventory"] = np.minimum(
    df["expected_demand"],
    df["capacity"]
)


### Outcome Metrics

In [15]:
df["unmet_demand"] = np.maximum(
    df["Weekly_Sales"] - df["allocated_inventory"],
    0
)

df["overstock"] = np.maximum(
    df["allocated_inventory"] - df["Weekly_Sales"],
    0
)

# Service level: defined only when demand > 0
df["service_level"] = np.where(
    df["Weekly_Sales"] > 0,
    (df["Weekly_Sales"] - df["unmet_demand"]) / df["Weekly_Sales"],
    np.nan
)


### Clean Decision Log

In [16]:
decision_log = df[
    [
        "Store", "Dept", "Date",
        "Weekly_Sales",
        "expected_demand",
        "allocated_inventory",
        "unmet_demand",
        "overstock",
        "service_level",
        "IsHoliday"
    ]
].copy()

decision_log.head()


Unnamed: 0,Store,Dept,Date,Weekly_Sales,expected_demand,allocated_inventory,unmet_demand,overstock,service_level,IsHoliday
0,1,1,2010-02-05,24924.5,19210.141,19210.141,5714.359,0.0,0.771,False
1,1,1,2010-02-05,24924.5,19210.141,19210.141,5714.359,0.0,0.771,False
2,1,1,2010-02-12,46039.49,20672.714,20672.714,25366.776,0.0,0.449,True
3,1,1,2010-02-19,41595.55,19210.141,19210.141,22385.409,0.0,0.462,False
4,1,1,2010-02-19,41595.55,19210.141,19210.141,22385.409,0.0,0.462,False


### Aggregate Evaluation Metrics

In [17]:
metrics = {
    "avg_service_level": decision_log["service_level"].mean(skipna=True),
    "total_unmet_demand": decision_log["unmet_demand"].sum(),
    "total_overstock": decision_log["overstock"].sum(),
    "mean_weekly_allocation": decision_log["allocated_inventory"].mean(),
    "allocation_volatility": decision_log["allocated_inventory"].std(),
    "fraction_zero_demand": (decision_log["Weekly_Sales"] == 0).mean()
}

metrics_df = pd.DataFrame.from_dict(metrics, orient="index", columns=["value"])
metrics_df



Unnamed: 0,value
avg_service_level,0.924
total_unmet_demand,591447024.367
total_overstock,638448237.214
mean_weekly_allocation,16060.674
allocation_volatility,21916.06
fraction_zero_demand,0.0


### Save Outputs

In [18]:
OUTPUT_PATH = "../outputs/"

decision_log.to_csv(
    OUTPUT_PATH + "phase1_decisions.csv",
    index=False
)

metrics_df.to_csv(
    OUTPUT_PATH + "phase1_metrics.csv"
)


## Phase 1 Complete

We have:
- A deterministic inventory decision system
- Logged weekly decisions
- Quantified outcomes
- No learning, no tuning, no hindsight

This system will be intentionally stressed in Phase 2
to expose decision degradation.


In [19]:
decision_log["service_level"].describe()


count   516891.000
mean         0.924
std          0.310
min        -71.600
25%          0.914
50%          1.000
75%          1.000
max          1.000
Name: service_level, dtype: float64