# Retail Data Lifecycle Mini Project
This notebook demonstrates an **end-to-end data lifecycle** using synthetic retail data.

Steps covered:
1. Capture (generate synthetic data)
2. Ingest (bronze layer)
3. Transform (silver layer)
4. Aggregate (gold layer)
5. Modeling (demand forecasting)
6. Forecasting next 7 days
7. Visualization

Outputs (`CSV` + `PNG`) can be imported into **Power BI** or any modern BI platform.

## 1. Setup & Libraries

In [None]:

import os
from pathlib import Path
import numpy as np
import pandas as pd
from datetime import datetime, timedelta
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_absolute_percentage_error, mean_squared_error
import joblib
import matplotlib.pyplot as plt

# Project root plus one sub-folder per lifecycle layer (raw -> bronze -> silver -> gold).
base = Path("retail_miniproject")
raw_dir, bronze_dir, silver_dir, gold_dir = (
    base / layer for layer in ("raw", "bronze", "silver", "gold")
)

# Create every layer folder up front; exist_ok makes re-running the cell harmless.
for layer_dir in (raw_dir, bronze_dir, silver_dir, gold_dir):
    layer_dir.mkdir(parents=True, exist_ok=True)


## 2. Capture (Synthetic Data Generation)

In [None]:

import zlib  # imported in this cell: gives a hash that is stable across kernel restarts

# Generate one year of synthetic daily sales, inventory snapshots, customer
# sign-ups and a small promotions calendar, then persist them as raw CSVs.
np.random.seed(42)
start_date = datetime(2024, 1, 1)
days = 365
dates = [start_date + timedelta(days=i) for i in range(days)]
stores = ["store_A", "store_B"]
skus = [f"sku_{i}" for i in range(1, 6)]

# The built-in hash() of a str is salted per interpreter process, so using it
# would change demand and price levels on every run despite the seed above.
# CRC32 returns the same integer for the same SKU id every time.
sku_hash = {sku: zlib.crc32(sku.encode("utf-8")) for sku in skus}

sales_rows, inventory_rows, customers = [], [], []

for date in dates:
    date_str = date.strftime("%Y-%m-%d")
    is_weekend = date.weekday() >= 5
    # Fixed-date holidays: 26 January and 25/26 December.
    holiday_flag = 1 if (date.month == 1 and date.day == 26) or (date.month == 12 and date.day in (25, 26)) else 0
    # Yearly sine-wave seasonality; it depends only on the date, so compute it once per day.
    seasonal = 5 * np.sin(2 * np.pi * (date.timetuple().tm_yday) / 365)
    for store in stores:
        for sku in skus:
            # Deliberately NOT named `base`: that global holds the project root
            # Path, and rebinding it to an int breaks later `base / "file"` uses.
            base_demand = 20 + (sku_hash[sku] % 10)
            noise = np.random.poisson(3)
            promo = np.random.binomial(1, 0.05)
            # -5 without a promo, +3 with one; +5 on weekends; -8 on holidays.
            qty = max(0, int(base_demand + seasonal - (5 if not promo else -3) + (5 if is_weekend else 0) + noise + (-8 if holiday_flag else 0)))
            # List price is SKU-specific; a promo takes 5 off.
            price = round(100 + (sku_hash[sku] % 30) - (5 if promo else 0), 2)
            sales_rows.append([date_str, store, sku, qty, price, int(promo)])
            # End-of-day stock: 200 minus a random draw in [0, 50) minus the day's sales.
            on_hand = max(0, 200 - np.random.randint(0, 50) - qty)
            inventory_rows.append([date_str, store, sku, on_hand])
    # Roughly one day in five gets a new customer sign-up.
    if np.random.rand() < 0.2:
        customers.append([f"cust_{np.random.randint(1,5000)}", date_str])

# Static promotions calendar; it is independent of the per-row random promo flag above.
promo_rows = [
    ["P1", "sku_1", "2024-02-01", "2024-02-10", 10],
    ["P2", "sku_3", "2024-11-20", "2024-11-30", 15],
]

sales_df = pd.DataFrame(sales_rows, columns=["date", "store_id", "sku", "qty", "unit_price", "promo"])
inventory_df = pd.DataFrame(inventory_rows, columns=["snapshot_date", "store_id", "sku", "on_hand"])
# Random ids can repeat, so keep only the first sign-up per customer.
customers_df = pd.DataFrame(customers, columns=["customer_id", "signup_date"]).drop_duplicates("customer_id")
promo_df = pd.DataFrame(promo_rows, columns=["promo_id", "sku", "start_date", "end_date", "discount_pct"])

# Save raw
sales_df.to_csv(raw_dir / "sales_raw.csv", index=False)
inventory_df.to_csv(raw_dir / "inventory_raw.csv", index=False)
customers_df.to_csv(raw_dir / "customers.csv", index=False)
promo_df.to_csv(raw_dir / "promotions.csv", index=False)

sales_df.head()


## 3. Ingest (Bronze Layer)

In [None]:

# Bronze layer: re-read the raw CSVs with their date columns parsed.
sales_bronze = pd.read_csv(raw_dir / "sales_raw.csv", parse_dates=["date"])
inventory_bronze = pd.read_csv(raw_dir / "inventory_raw.csv", parse_dates=["snapshot_date"])
promo_bronze = pd.read_csv(raw_dir / "promotions.csv", parse_dates=["start_date", "end_date"])
customers_bronze = pd.read_csv(raw_dir / "customers.csv", parse_dates=["signup_date"])

# Save bronze copies (output file name -> frame), written in the listed order.
bronze_outputs = {
    "sales_bronze.csv": sales_bronze,
    "inventory_bronze.csv": inventory_bronze,
    "promo_bronze.csv": promo_bronze,
    "customers_bronze.csv": customers_bronze,
}
for bronze_name, bronze_frame in bronze_outputs.items():
    bronze_frame.to_csv(bronze_dir / bronze_name, index=False)

sales_bronze.head()


## 4. Transform (Silver Layer)

In [None]:

# Silver layer: add line revenue to the sales data and keep only rows with a
# non-negative quantity. `assign` returns a new frame, so sales_bronze is untouched.
sales_silver = (
    sales_bronze
    .assign(revenue=lambda frame: frame["qty"] * frame["unit_price"])
    .loc[lambda frame: frame["qty"] >= 0]
)

# Sales is the only table transformed here; inventory and promotions are
# written to the silver folder unchanged.
sales_silver.to_csv(silver_dir / "sales_silver.csv", index=False)
inventory_bronze.to_csv(silver_dir / "inventory_silver.csv", index=False)
promo_bronze.to_csv(silver_dir / "promo_silver.csv", index=False)

sales_silver.head()


## 5. Aggregate (Gold Layer)

In [None]:

# Gold layer: one row per (date, store, SKU) with summed quantity, revenue and
# promo count, enriched with that day's inventory snapshot.
gold_sales = sales_silver.groupby(["date", "store_id", "sku"], as_index=False).agg(
    qty=("qty", "sum"), revenue=("revenue", "sum"), promo_count=("promo", "sum")
)
# Left join so that sales rows without a matching snapshot are kept.
gold_sales = gold_sales.merge(
    inventory_bronze,
    left_on=["date", "store_id", "sku"],
    right_on=["snapshot_date", "store_id", "sku"],
    how="left",
)
# snapshot_date duplicates `date` after the join.
gold_sales = gold_sales.drop(columns=["snapshot_date"])
# Fill missing stock with the median. Assign the result back instead of calling
# fillna(inplace=True) on the column selection: that form is chained assignment,
# which warns in pandas 2.x and does not update the frame under Copy-on-Write.
gold_sales["on_hand"] = gold_sales["on_hand"].fillna(gold_sales["on_hand"].median())
gold_sales.to_csv(gold_dir / "daily_sales_gold.csv", index=False)

gold_sales.head()


## 6. Modeling (Demand Forecasting)

In [None]:

# Build lag features per (store, SKU) series, hold out the last 28 days as a
# test set, fit a gradient-boosting regressor on daily quantity and save it.
df = gold_sales.copy().sort_values(["store_id", "sku", "date"])
df["date"] = pd.to_datetime(df["date"])
lag_days = 7
# Shift within each group so a lag never crosses into another store/SKU series.
qty_by_series = df.groupby(["store_id", "sku"])["qty"]
for lag in range(1, lag_days + 1):
    df[f"lag_{lag}"] = qty_by_series.shift(lag)
df["dayofweek"] = df["date"].dt.weekday
df["month"] = df["date"].dt.month
# The first `lag_days` rows of every series have incomplete lags; drop them.
df_model = df.dropna()

# NOTE(review): `on_hand` is the same-day snapshot, which the capture step
# computes as 200 - random draw - qty, so this feature encodes the target.
# Consider using the previous day's snapshot instead; left unchanged here
# because the forecasting cell builds its inputs from this same feature list.
features = [f"lag_{i}" for i in range(1, lag_days + 1)] + ["dayofweek", "month", "on_hand", "promo_count"]
target = "qty"

# Time-based split: everything after the cutoff is the test set.
cutoff_date = df_model["date"].max() - timedelta(days=28)
train = df_model[df_model["date"] <= cutoff_date]
test = df_model[df_model["date"] > cutoff_date]

X_train, y_train = train[features], train[target]
X_test, y_test = test[features], test[target]

model = GradientBoostingRegressor(n_estimators=200, learning_rate=0.1, max_depth=3, random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
print("MAPE:", mean_absolute_percentage_error(y_test, y_pred))
# Take the square root explicitly: the `squared=False` argument is deprecated
# and absent from newer scikit-learn releases, while this works on all versions.
print("RMSE:", np.sqrt(mean_squared_error(y_test, y_pred)))

# Resolve the project root from gold_dir rather than `base`: the capture cell
# reuses the name `base` for a number, which would make `base / "..."` fail.
# The trailing semicolon suppresses the notebook's echo of the return value.
joblib.dump(model, gold_dir.parent / "demand_model.joblib");


## 7. Forecasting Next 7 Days

In [None]:

# Recursive 7-day forecast per (store, SKU): each predicted day is fed back in
# as lag_1 for the next day.
last_date = gold_sales["date"].max()
future_dates = [pd.to_datetime(last_date) + timedelta(days=i) for i in range(1, 8)]
forecast_rows = []

for store in gold_sales["store_id"].unique():
    for sku in gold_sales["sku"].unique():
        hist = gold_sales[(gold_sales.store_id == store) & (gold_sales.sku == sku)].sort_values("date")
        # Need a full window of history to populate lag_1..lag_{lag_days};
        # a shorter (or empty) series would otherwise raise IndexError below.
        if len(hist) < lag_days:
            continue
        recent = hist.tail(lag_days)
        # Most recent first, so lag_vals[0] is lag_1.
        lag_vals = list(recent["qty"].values)[::-1]
        on_hand = recent["on_hand"].iloc[-1]
        for fd in future_dates:
            feat = {f"lag_{i}": lag_vals[i - 1] for i in range(1, lag_days + 1)}
            # No promotions are assumed for the forecast horizon.
            feat.update({"dayofweek": fd.weekday(), "month": fd.month, "on_hand": on_hand, "promo_count": 0})
            # Pin the column order to the training feature list instead of
            # relying on dict insertion order.
            pred = model.predict(pd.DataFrame([feat], columns=features))[0]
            pred_qty = max(0, int(round(pred)))
            forecast_rows.append([fd.strftime("%Y-%m-%d"), store, sku, pred_qty])
            # Roll the window: the new prediction becomes lag_1, the oldest lag drops out.
            lag_vals = [pred_qty] + lag_vals[:-1]
            # Stock is run down by forecast demand; no replenishment is modelled.
            on_hand = max(0, on_hand - pred_qty)

forecast_df = pd.DataFrame(forecast_rows, columns=["date", "store_id", "sku", "forecast_qty"])
forecast_df.to_csv(gold_dir / "forecast_daily.csv", index=False)
forecast_df.head()


## 8. Visualization

In [None]:

# Plot the last 60 days of actual sales next to the 7-day forecast for one
# store/SKU pair, and save the chart as a PNG alongside the gold CSVs.
sel_store, sel_sku = "store_A", "sku_1"
hist_plot = gold_sales[(gold_sales.store_id == sel_store) & (gold_sales.sku == sel_sku)].sort_values("date")
last_60 = hist_plot.tail(60)
future_plot = forecast_df[(forecast_df.store_id == sel_store) & (forecast_df.sku == sel_sku)]

fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(pd.to_datetime(last_60["date"]), last_60["qty"], label="historical_qty")
# Forecast dates are stored as strings, so convert them for a shared time axis.
ax.plot(pd.to_datetime(future_plot["date"]), future_plot["forecast_qty"], label="forecast_qty", linestyle="--")
ax.set_xlabel("date")
ax.set_ylabel("quantity")
ax.set_title(f"Historical vs Forecast ({sel_store}/{sel_sku})")
ax.legend()
# The notebook intro lists a PNG among its outputs, so persist the figure.
# Save before show(), which may release the figure in some backends.
fig.savefig(gold_dir / "forecast_plot.png", dpi=150, bbox_inches="tight")
plt.show()


## 9. Outputs for Power BI
- `gold/daily_sales_gold.csv`: Daily aggregated sales (for dashboards)
- `gold/forecast_daily.csv`: Forecasts for next 7 days

These CSVs can be imported directly into **Power BI** for building dashboards.