# Step 1 — Input Ingestion: Synthetic Data Foundation

**CSAO Rail Recommendation System · Zomathon Hackathon**

This notebook loads and verifies the synthetic data generated by `generate_data.py`.

| Dataset | Records | Key Fields |
|---------|---------|------------|
| `restaurants.csv` | 100 | cuisine, zone, price_tier, discount_thresholds |
| `menu_items.csv` | ~2000 | category, subcategory, veg_flag, is_combo, margin_pct |
| `users.csv` | 5000 | segment, dietary_preference, veg_days, RFM metrics |
| `order_history.csv` | 30000 | 5-week temporal span, segment-consistent spending |
| `sessions.csv` | 15000 | dietary_toggle, order_completed, cart size |
| `cart_events.csv` | ~287K | sequential events, 8-12% recommendation acceptance |

In [None]:
import pandas as pd
import numpy as np
import json
from datetime import datetime, timedelta

# Directory that holds the generated CSVs, relative to the notebook's
# working directory.
DATA = "../data"

# Load each table once; the tuple order must match the unpacking targets.
restaurants, menu, users, orders, sessions, events = (
    pd.read_csv(f"{DATA}/{stem}.csv")
    for stem in (
        "restaurants",
        "menu_items",
        "users",
        "order_history",
        "sessions",
        "cart_events",
    )
)

# Row-count overview: label left-padded to 12 chars, count right-aligned
# with thousands separators.
for label, frame in (
    ("Restaurants", restaurants),
    ("Menu items", menu),
    ("Users", users),
    ("Orders", orders),
    ("Sessions", sessions),
    ("Cart events", events),
):
    print(f"{label:<12}: {len(frame):>6,}")

## 1. Restaurant Distribution

In [None]:
# Categorical breakdowns: one value_counts table per column.
print("=== Cuisine Distribution ===")
print(restaurants["primary_cuisine"].value_counts().to_string())
for title, column in (("Price Tier", "price_tier"), ("City", "city")):
    print(f"\n=== {title} Distribution ===")
    print(restaurants[column].value_counts().to_string())

print("\n=== Discount Thresholds ===")
# A positive free_delivery_min is counted as "has a free delivery threshold".
free_delivery_count = restaurants["free_delivery_min"].gt(0).sum()
# discount_thresholds is parsed as JSON per row; a non-empty result counts
# as "has discount offers".
parsed_discounts = restaurants["discount_thresholds"].map(json.loads)
discount_count = parsed_discounts.map(len).gt(0).sum()
print(f"  Restaurants with free delivery threshold: {free_delivery_count}")
print(f"  Restaurants with discount offers: {discount_count}")

print("\n=== Sample Restaurants ===")
# Bare expression kept last so the notebook renders it as the cell output.
preview_columns = ["restaurant_id", "name", "city", "primary_cuisine", "price_tier", "rating"]
restaurants[preview_columns].head(10)

## 2. Menu Items Analysis

In [None]:
# Menu overview: category mix, veg split, combos, prices, margins, stock.
print("=== Category Distribution ===")
print(menu["category"].value_counts().to_string())

print("\n=== Veg / Non-Veg Split ===")
veg = menu["veg_flag"].sum()
print(f"  Veg: {veg}  |  Non-Veg: {len(menu) - veg}")

print("\n=== Combo Items ===")
combos = menu[menu["is_combo"] == True]
print(f"  Total combos: {len(combos)}")
print("  Sample combos:")
for _, c in combos.head(5).iterrows():
    print(f"    {c['name']} (Rs {c['price']}) - {c['cuisine_tag']}")
    print(f"      Components: {c['combo_components']}")

print("\n=== Price Statistics ===")
print(menu.groupby("category")["price"].agg(["mean", "min", "max"]).round(0).to_string())

print("\n=== Margin Distribution ===")
margin = menu["margin_pct"]
print(f"  Below 10%: {(margin < 10).sum()} items (will be filtered in Step 2)")
print(f"  10-30%:    {((margin >= 10) & (margin < 30)).sum()}")
print(f"  30-50%:    {((margin >= 30) & (margin < 50)).sum()}")
print(f"  50%+:      {(margin >= 50).sum()}")

print("\n=== Availability ===")
# `~` is a bitwise NOT: on an int 0/1 column it yields -1/-2 and the sum goes
# negative. Coerce to bool first so the inversion is logical regardless of
# how read_csv inferred the column's dtype.
in_stock = menu["availability"].astype(bool)
print(f"  In stock: {in_stock.sum()}  |  Out of stock: {(~in_stock).sum()}")
print(f"  Bestsellers: {menu['bestseller_flag'].sum()}")

## 3. User Segments & Preferences

In [None]:
# User base overview: segment mix, dietary preference, veg days, RFM.
print("=== Segment Distribution ===")
print(users["segment"].value_counts().to_string())

print("\n=== Dietary Preferences ===")
print(users["dietary_preference"].value_counts().to_string())

print("\n=== Users with Veg Days Set ===")
# veg_days is parsed as JSON per user; a non-empty result means at least one
# veg day is configured.
veg_day_lists = users["veg_days"].map(json.loads)
has_veg_days = veg_day_lists.map(len).gt(0).sum()
veg_days_pct = has_veg_days / len(users) * 100
print(f"  {has_veg_days} users ({veg_days_pct:.1f}%) have veg days configured")

print("\n=== RFM Metrics by Segment ===")
rfm_cols = ["rfm_recency", "rfm_frequency", "rfm_monetary", "avg_order_value", "order_count"]
segment_means = users.groupby("segment")[rfm_cols].mean()
print(segment_means.round(0).to_string())

print("\n=== Cold Start Candidates (Occasional segment) ===")
occasional = users[users["segment"].eq("Occasional")]
mean_order_count = occasional["order_count"].mean()
mean_recency = occasional["rfm_recency"].mean()
print(f"  Count: {len(occasional)}")
print(f"  Avg order count: {mean_order_count:.1f}")
print(f"  Avg recency (days): {mean_recency:.0f}")

## 4. Order History & Temporal Split

In [None]:
# Week boundaries are measured from this fixed start date.
START = datetime(2025, 12, 1)
orders["order_dt"] = pd.to_datetime(orders["order_time"])

w3 = START + timedelta(weeks=3)
w4 = START + timedelta(weeks=4)

# Chronological split: everything before w3 trains, [w3, w4) validates,
# w4 onwards tests.
order_dt = orders["order_dt"]
train = orders[order_dt.lt(w3)]
val = orders[order_dt.ge(w3) & order_dt.lt(w4)]
test = orders[order_dt.ge(w4)]

print("=== Temporal Split (no future leakage) ===")
for label, part in (
    ("Train (weeks 1-3)", train),
    ("Val   (week 4)", val),
    ("Test  (week 5)", test),
):
    share = len(part) / len(orders) * 100
    print(f"  {label:<18}: {len(part):>6,} orders  ({share:.1f}%)")

print("\n=== Order Completion ===")
completed = orders["was_completed"]
print(f"  Completed: {completed.sum():,} ({completed.mean()*100:.1f}%)")

print("\n=== Order Value by Meal Period ===")
value_by_period = orders.groupby("meal_period")["order_value"].agg(["count", "mean"])
print(value_by_period.round(0).to_string())

print("\n=== Meal Period Distribution ===")
print(orders["meal_period"].value_counts().to_string())

## 5. Sessions & Cart Events — Recommendation Acceptance

In [None]:
# Session completion, dietary-toggle usage and recommendation acceptance.
print("=== Session Completion ===")
s_comp = sessions["order_completed"].mean() * 100
print(f"  Completed: {sessions['order_completed'].sum():,} ({s_comp:.1f}%)")
print(f"  Avg cart items: {sessions['num_cart_items'].mean():.1f}")

print("\n=== Dietary Toggle Usage ===")
print(sessions["dietary_toggle"].value_counts().to_string())

print("\n=== Cart Events Breakdown ===")
# `recs` and `accepted` are reused by the data quality checklist cell.
recs = events[events["was_recommendation"] == True]
organic = events[events["was_recommendation"] == False]
accepted = recs[recs["was_accepted"] == True]
rejected = recs[recs["was_accepted"] == False]

print(f"  Total events       : {len(events):>8,}")
print(f"  Organic adds       : {len(organic):>8,}")
print(f"  Recs shown         : {len(recs):>8,}")
print(f"  Recs accepted      : {len(accepted):>8,}")
print(f"  Recs rejected      : {len(rejected):>8,}")

# With no recommendation rows the ratio is undefined. Report NaN (which
# fails the range test below) instead of raising ZeroDivisionError and
# aborting the cell.
rate = len(accepted) / len(recs) * 100 if len(recs) else float("nan")
verdict = "OK" if 8 <= rate <= 12 else "OUT OF RANGE"
print(f"\n  >>> Acceptance rate: {rate:.1f}% (target: 8-12%) {verdict}")

print("\n=== Acceptance by Position Shown ===")
pos_stats = recs.groupby("position_shown")["was_accepted"].mean() * 100
for pos, pct in pos_stats.items():
    # Text histogram: two '#' per percentage point.
    bar = "#" * int(pct * 2)
    print(f"  Position {int(pos):>2}: {pct:>5.1f}% {bar}")

## 6. Sample Session Trace

Reconstruct a single session (the first one with at least four cart items) to verify that its cart events form a coherent sequence:

In [None]:
# Trace the first session that has at least 4 cart items, one row per event.
candidates = sessions[sessions["num_cart_items"] >= 4]
if candidates.empty:
    # Avoid an opaque IndexError from .iloc[0] when nothing qualifies.
    print("No session with at least 4 cart items found; nothing to trace.")
else:
    sid = candidates.iloc[0]["session_id"]
    sess = sessions[sessions["session_id"] == sid].iloc[0]
    sess_events = events[events["session_id"] == sid].sort_values("timestamp")

    # Build the item_id -> name lookup once instead of scanning `menu` for
    # every event row. drop_duplicates keeps the first match, as the
    # per-row scan did.
    session_menu = menu[menu["item_id"].isin(sess_events["item_id"])]
    session_menu = session_menu.drop_duplicates("item_id")
    item_names = dict(zip(session_menu["item_id"], session_menu["name"]))

    print(f"Session: {sid}")
    print(f"  User: {sess['user_id']}  |  Restaurant: {sess['restaurant_id']}")
    print(f"  Meal: {sess['meal_period']}  |  Zone: {sess['zone']}  |  Diet toggle: {sess['dietary_toggle']}")
    print(f"  Completed: {sess['order_completed']}  |  Value: Rs {sess['final_order_value']}")
    print(f"  Cart items: {sess['num_cart_items']}")

    # Header widths must mirror the row format below (Item is 28 wide), and
    # the rule is sized from the header so the table stays aligned.
    header = f"  {'Event':<12} {'Item':<28} {'Type':<14} {'Pos':>4} {'CartPos':>8} {'Accepted':<10}"
    print("\n" + header)
    print("  " + "-" * (len(header) - 2))
    for _, e in sess_events.iterrows():
        if not e["was_recommendation"]:
            etype = "Organic add"
        elif e["was_accepted"] == True:
            # Explicit comparison: a NaN was_accepted is truthy and would
            # otherwise be reported as an acceptance.
            etype = "Rec ACCEPT"
        else:
            etype = "Rec shown"
        pos = f"{int(e['position_shown'])}" if pd.notna(e["position_shown"]) else "-"
        cpos = f"{int(e['cart_position'])}" if pd.notna(e["cart_position"]) else "-"
        acc = str(e["was_accepted"]) if pd.notna(e["was_accepted"]) else "-"
        full_name = item_names.get(e["item_id"])
        # Fall back to the raw item_id when the item is missing from the menu.
        iname = full_name[:25] if full_name is not None else e["item_id"]
        print(f"  {e['event_id']:<12} {iname:<28} {etype:<14} {pos:>4} {cpos:>8} {acc:<10}")

## 7. Data Quality Summary

Final checklist for Step 1 verification:

In [None]:
# Each entry is (check name, passed?, detail string shown next to the result).
# Relies on `recs`, `accepted`, `train`, `val` and `test` from earlier cells.
checks = []

# 1. Acceptance rate
# An empty `recs` must show up as a FAIL row, not abort the whole checklist
# with ZeroDivisionError; NaN fails the range comparison.
rate = len(accepted) / len(recs) * 100 if len(recs) else float("nan")
checks.append(("Acceptance rate 8-12%", 8 <= rate <= 12, f"{rate:.1f}%"))

# 2. Segment distribution
seg_counts = users["segment"].value_counts(normalize=True)
budget_share = seg_counts.get("Budget", 0)
occasional_share = seg_counts.get("Occasional", 0)
checks.append(("Budget ~30%", 0.25 <= budget_share <= 0.35, f"{budget_share*100:.1f}%"))
checks.append(("Occasional ~25%", 0.20 <= occasional_share <= 0.30, f"{occasional_share*100:.1f}%"))

# 3. Temporal split integrity
# Same guard for an empty order table: report 0% rather than crash.
train_share = len(train) / len(orders) if len(orders) else 0.0
checks.append(("Train set > 50%", train_share > 0.50, f"{train_share*100:.1f}%"))
checks.append(("Val+Test exist", len(val) > 0 and len(test) > 0, f"val={len(val)}, test={len(test)}"))

# 4. Combos present
combo_count = menu["is_combo"].sum()
checks.append(("Combos/thalis exist", combo_count > 0, f"{int(combo_count)} combos"))

# 5. Items with low margin (for filtering)
low_margin_count = (menu["margin_pct"] < 10).sum()
checks.append(("Low-margin items exist", low_margin_count > 0, f"{low_margin_count} items"))

# 6. Out-of-stock items exist
# Coerce to bool before inverting: `~` on an int 0/1 column is a bitwise NOT
# and would produce a negative count.
out_of_stock_count = (~menu["availability"].astype(bool)).sum()
checks.append(("Out-of-stock items exist", out_of_stock_count > 0, f"{out_of_stock_count} items"))

# 7. Dietary toggles used
toggle_count = (sessions["dietary_toggle"] != "none").sum()
checks.append(("Dietary toggles active", toggle_count > 0, f"{toggle_count} sessions"))

# 8. Veg days on profiles
has_vd = users["veg_days"].apply(lambda x: len(json.loads(x)) > 0).sum()
checks.append(("Profile veg days set", has_vd > 0, f"{has_vd} users"))

# 9. Multiple cuisines
cuisine_count = restaurants["primary_cuisine"].nunique()
checks.append(("10 cuisines present", cuisine_count == 10, f"{cuisine_count} cuisines"))

# 10. Session completion reasonable
completion = sessions["order_completed"].mean()
checks.append(("Session completion 60-90%", 0.60 <= completion <= 0.90, f"{completion*100:.1f}%"))

print("=" * 62)
print("  STEP 1 DATA QUALITY CHECKLIST")
print("=" * 62)
for name, passed, detail in checks:
    status = "PASS" if passed else "FAIL"
    print(f"  [{status}] {name:<35} {detail}")
all_ok = all(passed for _, passed, _ in checks)
print("=" * 62)
if all_ok:
    print("  All checks passed. Data ready for Step 2.")
else:
    print("  Some checks failed. Review above.")
print("=" * 62)