# Walmart Retail Sales Exploratory Analysis

Exploratory notebook for the Walmart weekly sales dataset used throughout the forecasting portfolio.

## Objectives

- Resolve project paths so the notebook works from the repo root, the project folder, or the project's `notebooks/` folder.
- Load the raw Walmart sales data and inspect its schema and data quality.
- Generate exploratory visualisations for overall, per-store, and seasonal behaviour.
- Preview the static feature engineering pipeline that powers the training workflow.
- Persist an enriched feature table into `data/processed/` for downstream experimentation.

In [None]:
from __future__ import annotations

import math
from pathlib import Path
from typing import Final

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from hydra.utils import instantiate
from omegaconf import OmegaConf

# Shared look for every figure in the notebook: seaborn grid theme plus
# default figure size and title/label font sizes.
sns.set_theme(style="whitegrid")
plt.rcParams.update(
    {
        "figure.figsize": (12, 6),
        "axes.titlesize": 14,
        "axes.labelsize": 12,
    }
)

def resolve_project_dir(project_name: str = "retail_sales_walmart") -> Path:
    """Locate the project directory regardless of launch location.

    The current working directory is inspected first, then each of its
    ancestors (nearest first). A directory matches when it contains
    ``projects/<project_name>`` or when it is itself named ``project_name``
    and has a ``data`` subdirectory. A working directory named ``notebooks``
    whose parent has a ``data`` subdirectory resolves to that parent.

    Args:
        project_name: Folder name of the project to locate.

    Returns:
        Resolved path of the project directory.

    Raises:
        FileNotFoundError: If no matching directory is found from the current
            working directory or any of its ancestors.
    """
    cwd = Path.cwd().resolve()

    # Common entry points: repo root, project folder, notebooks folder
    if (cwd / "projects" / project_name).exists():
        return cwd / "projects" / project_name

    if cwd.name == project_name and (cwd / "data").exists():
        return cwd

    if cwd.name == "notebooks" and (cwd.parent / "data").exists():
        return cwd.parent

    # Launched from somewhere deeper in the tree: walk upwards.
    for parent in cwd.parents:
        if (parent / "projects" / project_name).exists():
            return parent / "projects" / project_name
        if parent.name == project_name and (parent / "data").exists():
            return parent

    # Report the project actually requested rather than the default name.
    raise FileNotFoundError(
        "Unable to locate project directory. Run from the repo root or ensure the "
        f"current working directory contains the {project_name} project."
    )

# Every path below is derived once from the detected project directory.
PROJECT_DIR: Final[Path] = resolve_project_dir()

# Data locations inside the project
DATA_DIR: Final[Path] = PROJECT_DIR.joinpath("data")
RAW_PATH: Final[Path] = DATA_DIR.joinpath("raw", "Walmart.csv")
PROCESSED_DIR: Final[Path] = DATA_DIR.joinpath("processed")
FEATURE_CACHE: Final[Path] = PROCESSED_DIR.joinpath("walmart_features_exploratory.parquet")

# Config tree is looked up two levels above the project directory
CONF_DIR: Final[Path] = PROJECT_DIR.parents[1].joinpath("src", "ml_portfolio", "conf")
FEATURE_CONFIG_NAME: Final[str] = "walmart_full"
FEATURE_CONFIG_PATH: Final[Path] = CONF_DIR.joinpath(
    "feature_engineering", f"{FEATURE_CONFIG_NAME}.yaml"
)

print(f"Project directory: {PROJECT_DIR}")
print(f"Raw data path: {RAW_PATH}")
print(f"Feature config: {FEATURE_CONFIG_PATH}")

## 1. Load and inspect the raw dataset

In [None]:
# Read the raw CSV and report its dimensions before previewing the first rows.
df_raw = pd.read_csv(RAW_PATH)
row_count = len(df_raw)
col_count = len(df_raw.columns)
print(f"Rows: {row_count:,} | Columns: {col_count}")
df_raw.head()

### Data quality overview
We compute summary statistics and per-column missing-value counts, then count duplicate rows on the canonical (`Store`, `Date`) key.

In [None]:
# Summary statistics for every column, one row per column
summary_stats = df_raw.describe(include="all").transpose()
display(summary_stats)

# Missing values per column, as a count and as a percentage of all rows
missing_summary = df_raw.isna().sum().rename("missing_count").to_frame()
missing_summary["missing_pct"] = (missing_summary["missing_count"] / row_count) * 100
display(missing_summary.sort_values("missing_pct", ascending=False))

# Rows that repeat an earlier (Store, Date) combination
key_columns = ["Store", "Date"]
duplicate_rows = df_raw.duplicated(subset=key_columns).sum()
print(f"Duplicate [Store, Date] pairs: {duplicate_rows}")

## 2. Seasonal and store-level behaviour

In [None]:
# Work on a copy with parsed dates (day-month-year), ordered by store then date.
df = df_raw.assign(
    Date=pd.to_datetime(df_raw["Date"], format="%d-%m-%Y")
).sort_values(["Store", "Date"])

# Calendar components derived from the parsed date
date_parts = df["Date"].dt
df["Week"] = date_parts.isocalendar().week.astype(int)
df["Year"] = date_parts.year
df["Month"] = date_parts.month
df["Quarter"] = date_parts.quarter
df["DayOfYear"] = date_parts.dayofyear

# Sine/cosine encodings of week (period 52) and month (period 12)
for column, period in (("Week", 52), ("Month", 12)):
    angle = 2 * math.pi * df[column] / period
    df[f"{column}_sin"] = np.sin(angle)
    df[f"{column}_cos"] = np.cos(angle)

# Per-store sales statistics, highest average first
sales_by_store = df.groupby("Store")["Weekly_Sales"]
store_summary = sales_by_store.agg(["mean", "median", "std", "max"])
store_summary = store_summary.sort_values("mean", ascending=False)
display(store_summary.head(10))

# One line per store, without a legend
fig, ax = plt.subplots(figsize=(12, 5))
sns.lineplot(data=df, x="Date", y="Weekly_Sales", hue="Store", legend=False, ax=ax)
ax.set_title("Weekly sales per store")
ax.set_ylabel("Weekly Sales (USD)")
fig.tight_layout()

### Holiday effect snapshot

In [None]:
# Mean weekly sales for each (holiday flag, store) combination
grouped_sales = df.groupby(["Holiday_Flag", "Store"])["Weekly_Sales"]
holiday_totals = grouped_sales.mean().reset_index()

# Bars per store, split by holiday flag
ax = sns.barplot(data=holiday_totals, x="Store", y="Weekly_Sales", hue="Holiday_Flag")
ax.set_title("Average weekly sales by holiday flag")
ax.set_ylabel("Weekly Sales (USD)")
plt.tight_layout()

## 3. Reusing the Hydra static feature pipeline
The training script relies on `feature_engineering/walmart_full.yaml`. We instantiate the `static` pipeline defined in that config here to ensure parity between exploration and modelling.

In [None]:
# Build the static feature pipeline from the YAML config at FEATURE_CONFIG_PATH.
feature_cfg = OmegaConf.load(FEATURE_CONFIG_PATH)
static_cfg = feature_cfg.get("static")
if static_cfg is None:
    # instantiate(None) returns None, which would otherwise only surface as an
    # opaque AttributeError on the engineer_features call below.
    raise KeyError(f"No 'static' section found in feature config: {FEATURE_CONFIG_PATH}")
static_pipeline = instantiate(static_cfg)

# The pipeline receives a copy, so df_raw itself is not passed in.
features_df = static_pipeline.engineer_features(df_raw.copy())

# Raw dates are day-first; format="mixed" infers the format per element.
# NOTE(review): format="mixed" needs pandas >= 2.0 - confirm the pinned version.
features_df["Date"] = pd.to_datetime(features_df["Date"], dayfirst=True, format="mixed")
features_df.head()

### Feature importance proxy
A quick tree-based model gives a sense of which engineered signals matter most on a single store sample.

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Restrict the quick model to store 1, ordered by date
store_example = features_df.loc[features_df["Store"] == 1].sort_values("Date")

# Every column except the target, the date and the store id is a predictor
non_feature_cols = {"Weekly_Sales", "Date", "Store"}
feature_cols = [col for col in store_example.columns if col not in non_feature_cols]
X = store_example[feature_cols].astype(float).values
y = store_example["Weekly_Sales"].values

# Fixed seed so the importances are reproducible between runs
rf = RandomForestRegressor(n_estimators=200, random_state=42, n_jobs=-1).fit(X, y)

# Keep the ten largest importances, labelled by feature name
all_importances = pd.Series(rf.feature_importances_, index=feature_cols)
importances = all_importances.sort_values(ascending=False).head(10)

fig, ax = plt.subplots(figsize=(10, 4))
sns.barplot(x=importances.values, y=importances.index, orient="h", ax=ax)
ax.set_title("Top engineered features (Store 1)")
ax.set_xlabel("Importance")
fig.tight_layout()
importances.to_frame(name="importance")

## 4. Persist enriched features
Cache the engineered table so training notebooks or scripts can reuse the same features without recomputation.

In [None]:
# Create the output folder if needed, then write the feature table without its index.
PROCESSED_DIR.mkdir(exist_ok=True, parents=True)
features_df.to_parquet(path=FEATURE_CACHE, index=False)
print(f"Feature table saved to {FEATURE_CACHE}")