<a href="https://colab.research.google.com/github/Jana-Alrzoog/2025_GP_28/blob/main/masar-sim/notebooks/masar_occupancy_monthly.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>


# **Masar Occupancy — Month Generator**

Generates **minute-level passenger occupancy** for a full **calendar month** across selected stations/lines.  
Starts from a **base-day curve** and applies **context modifiers** (station capacity, weekend, weather, and events; holidays configurable),  
then exports tidy CSVs for dashboards and Firestore/pipeline publishing.

---

### 🎯 Purpose
- Produce **month-long time series** at **1-minute resolution**.
- Robustly fill all calendar days and keep station/event consistency.
- Output **validated CSVs**: per-day (optional) and a **consolidated monthly file**.

---

### 🧩 Inputs
Base-day template: `base_day.csv` (or `day_base.csv`). Seeds: `stations`, `events`, `holidays` (optional), `weather`. Config: `00_config.yaml`.

---

### ⚙️ Workflow
1) Load config & seeds.  
2) Select month window (e.g., `2025-09`).  
3) Build minute grid per day & station.  
4) Compute modifiers (weekend, weather, events, holiday toggle).  
5) Map to `station_total` by capacity; derive `crowd_level`.  
6) QA & export consolidated monthly CSV (and optional per-day files).

---


In [1]:
# Clone the project repository into the Colab workspace, then switch into the
# simulator folder that the following cells read from and write to.
%cd /content
!git clone https://github.com/Jana-Alrzoog/2025_GP_28.git
%cd /content/2025_GP_28/masar-sim
# List the folder contents as a quick visual check of the checkout.
!ls


/content
Cloning into '2025_GP_28'...
remote: Enumerating objects: 716, done.[K
remote: Counting objects: 100% (231/231), done.[K
remote: Compressing objects: 100% (224/224), done.[K
remote: Total 716 (delta 120), reused 1 (delta 1), pack-reused 485 (from 1)[K
Receiving objects: 100% (716/716), 8.11 MiB | 7.84 MiB/s, done.
Resolving deltas: 100% (267/267), done.
/content/2025_GP_28/masar-sim
data  lib  notebooks  sims


In [2]:
# =========================================================
# Generate a full month with changing scenarios
# =========================================================

import os, json, csv, yaml
import numpy as np
import pandas as pd
from datetime import datetime, timedelta
from dateutil.parser import parse

# Candidate checkout locations, probed in order; the first one that exists becomes ROOT.
CANDIDATES = [
    "/content/2025_GP_28_latest/masar-sim",
    "/content/2025_GP_28/masar-sim",
    "/content/masar-sim",
]
ROOT = next((p for p in CANDIDATES if os.path.exists(p)), None)
assert ROOT, "لم أجد مجلد masar-sim. تأكدي من الكلون والمسار."
SEED = f"{ROOT}/data/seeds"
GEN  = f"{ROOT}/data/generated"
CONF = f"{ROOT}/sims/00_config.yaml"

print("ROOT =", ROOT)
print("GEN  =", GEN)
print("CONF =", CONF)

# All text inputs are opened with an explicit UTF-8 encoding (as the generator
# cells further down already do) so decoding does not depend on the platform's
# default locale.
with open(CONF, "r", encoding="utf-8") as f:
    # An empty YAML document parses to None; normalise to a dict for .get() access.
    config = yaml.safe_load(f) or {}

with open(f"{SEED}/stations.json", "r", encoding="utf-8") as f:
    stations = json.load(f)
with open(f"{SEED}/weather_patterns.json", "r", encoding="utf-8") as f:
    weather_map = json.load(f)
# newline="" is the csv-module recommended way to open files for csv readers.
with open(f"{SEED}/calendar_events.csv", "r", encoding="utf-8", newline="") as f:
    events_seed = list(csv.DictReader(f))


# The single-day template must already exist (it is produced by masar_base_demand.ipynb).
base_path = f"{GEN}/base_day.csv"
assert os.path.exists(base_path), "base_day.csv غير موجود—شغّلي masar_base_demand.ipynb أولًا."
base_day = pd.read_csv(base_path, parse_dates=["timestamp"])
print(f"base_day rows={len(base_day):,}, stations={base_day['station_id'].nunique()}, day={base_day['timestamp'].dt.date.iloc[0]}")


ROOT = /content/2025_GP_28/masar-sim
GEN  = /content/2025_GP_28/masar-sim/data/generated
CONF = /content/2025_GP_28/masar-sim/sims/00_config.yaml
base_day rows=6,486, stations=6, day=2025-09-24


In [None]:
# ============================================================
# Masar Occupancy — Month Generator (September 2025)
# - Builds minute-level demand for 2025-09-01 → 2025-09-30
# - Event types are read from calendar_events.csv per date/station
# - Holidays can be toggled (default OFF)
# - Saves consolidated monthly CSV (+ optional per-day files)
# ============================================================

import os, csv, json, yaml
import numpy as np
import pandas as pd
from datetime import datetime, timedelta

# -------------------- 0) Paths & config --------------------
# Fixed Colab checkout location; seed, config and output paths are all derived from it.
ROOT = "/content/2025_GP_28/masar-sim"
SEED = f"{ROOT}/data/seeds"
CONF = f"{ROOT}/sims/00_config.yaml"
OUT_DIR = f"{ROOT}/data/generated"
# Make sure the output folder exists before anything is written to it.
os.makedirs(OUT_DIR, exist_ok=True)

# Falls back to an empty dict when the YAML loads as a falsy value (e.g. an empty file),
# so the later config.get(...) calls keep working.
with open(CONF, "r", encoding="utf-8") as f:
    config = yaml.safe_load(f) or {}

# Month window (inclusive)
MONTH = "2025-09"


def month_bounds(month: str):
    """Return the first and last calendar day of ``month``.

    Args:
        month: Month in ``YYYY-MM`` form, e.g. ``"2025-09"``.

    Returns:
        Tuple ``(start, end)`` of ``pd.Timestamp``; ``start`` is day 1 and
        ``end`` is the month's real last day (28-31, leap years included).
    """
    start = pd.Timestamp(f"{month}-01")
    # MonthEnd(0) rolls a date forward to the end of its own month.
    return start, start + pd.offsets.MonthEnd(0)


# Both bounds are derived from MONTH, so changing MONTH can never leave a stale
# (or invalid, e.g. "2025-02-30") hard-coded last day behind.
MONTH_START, MONTH_END = month_bounds(MONTH)

HOLIDAYS_ON = False         # set True to enable holiday multipliers
SAVE_DAILY  = False         # set True if you also want per-day CSVs

# -------------------- 1) Load base-day template --------------------
# We only need a single-day base grid: station_id, minute_of_day, base_demand
# Candidate locations of the single-day template, probed in order; the first existing file wins.
candidates = [
    f"{OUT_DIR}/day_base.csv",             # recommended new name
    f"{OUT_DIR}/base_day.csv",             # your current file
    f"{ROOT}/data/base/day_base.csv",
    f"{ROOT}/data/base/base_day.csv",
    f"{OUT_DIR}/day_demand_base.csv",      # legacy
    f"{ROOT}/data/base/day_demand_base.csv",
]
src = next((p for p in candidates if os.path.exists(p)), None)
if src is None:
    raise FileNotFoundError("No base-day CSV found. Expected one of:\n" + "\n".join(candidates))
base_day = pd.read_csv(src)
print("Loaded base-day from:", src)

# Normalize headers: trim + lower-case, then map known header variants onto the
# three canonical names station_id / base_demand / minute_of_day.
base_day.columns = [str(c).strip().lower() for c in base_day.columns]
rename_map = {
    "station":"station_id","station code":"station_id","station_code":"station_id","sid":"station_id",
    "base":"base_demand","basedemand":"base_demand","demand_base":"base_demand",
    "base_day_demand":"base_demand","base_day":"base_demand",
    "minute":"minute_of_day","min":"minute_of_day","minuteofday":"minute_of_day","minute-of-day":"minute_of_day",
}
base_day = base_day.rename(columns=rename_map)

# Build minute_of_day if missing. Sources are tried in order: hour+minute columns,
# a "time" column, a "timestamp" column; unparseable values become minute 0.
if "minute_of_day" not in base_day.columns:
    if {"hour","minute"}.issubset(base_day.columns):
        base_day["minute_of_day"] = (pd.to_numeric(base_day["hour"], errors="coerce").fillna(0).astype(int)*60 +
                                     pd.to_numeric(base_day["minute"], errors="coerce").fillna(0).astype(int))
    elif "time" in base_day.columns:
        t = pd.to_datetime(base_day["time"], errors="coerce")
        base_day["minute_of_day"] = (t.dt.hour*60 + t.dt.minute).astype("Int64").fillna(0).astype(int)
    elif "timestamp" in base_day.columns:
        ts = pd.to_datetime(base_day["timestamp"], errors="coerce")
        base_day["minute_of_day"] = (ts.dt.hour*60 + ts.dt.minute).astype("Int64").fillna(0).astype(int)
    else:
        # best-effort fallback (if rows are minute-ordered): the row position is used
        # as the minute and clipped to the valid 0..1439 range.
        base_day = base_day.reset_index().rename(columns={"index":"minute_of_day"})
        base_day["minute_of_day"] = base_day["minute_of_day"].clip(0, 1439).astype(int)
# Fail early if the template lacks either of the two columns that cannot be derived.
for c in ["station_id","base_demand"]:
    if c not in base_day.columns:
        raise KeyError(f"Missing required column '{c}' in base-day template.")
# Final dtype clean-up: trimmed string ids, numeric demand (bad values -> 0.0), integer minutes.
base_day["station_id"]  = base_day["station_id"].astype(str).str.strip()
base_day["base_demand"] = pd.to_numeric(base_day["base_demand"], errors="coerce").fillna(0.0)
base_day["minute_of_day"] = pd.to_numeric(base_day["minute_of_day"], errors="coerce").fillna(0).astype(int)

# -------------------- 2) Stations & capacities --------------------
# Lookup-key normaliser: stringify, trim and upper-case so that matching is case-insensitive.
def _norm(x): return str(x).strip().upper()
with open(f"{SEED}/stations.json", "r", encoding="utf-8") as f:
    stations_list = json.load(f)
# Per-station capacity table; merged onto the month grid later in this cell.
capacity_df = pd.DataFrame(stations_list)[["station_id","capacity_station"]]
# Reverse lookups keyed by normalised code / normalised name, both yielding the station_id.
sid_by_code, sid_by_name = {}, {}
for st in stations_list:
    sid  = str(st.get("station_id","")).strip()
    code = str(st.get("code","")).strip()
    name = str(st.get("name","")).strip()
    if code: sid_by_code[_norm(code)] = sid
    if name: sid_by_name[_norm(name)] = sid

# Extra labels (already in normalised, upper-case form) mapped to a station code;
# consulted by resolve_sid only after the code and name lookups have missed.
ALIASES = {
    "AIRPORT T1-2": "AIRP_T12",
    "QASR AL-HOKM": "QASR",
    "NATIONAL MUSEUM": "MUSEUM",
    "WESTERN STATION": "S6",
}
def resolve_sid(token: str):
    """Translate a station token into a station_id.

    The normalised token is looked up as a station code first, then as a
    station name, and finally in ALIASES. An alias target is itself treated
    as a code; if that code is unknown the raw alias target is returned.

    Returns:
        The matching station_id, or None when the token is not recognised.
    """
    key = _norm(token)
    for table in (sid_by_code, sid_by_name):
        if key in table:
            return table[key]
    if key not in ALIASES:
        return None
    alias_target = ALIASES[key]
    return sid_by_code.get(_norm(alias_target), alias_target)

# -------------------- 3) Calendar (events & holidays) --------------------
def norm_date(x: str) -> str:
    """Normalise a date-like value to ``YYYY-MM-DD``.

    Parsing is attempted month-first and, only if that yields nothing,
    day-first. None, blank and unparseable inputs all give "".
    """
    text = "" if x is None else str(x).strip()
    if text == "":
        return ""
    parsed = pd.NaT
    for day_first in (False, True):
        parsed = pd.to_datetime(text, errors="coerce", dayfirst=day_first)
        if not pd.isna(parsed):
            break
    if pd.isna(parsed):
        return ""
    return parsed.strftime("%Y-%m-%d")

# Events
events_csv = f"{SEED}/calendar_events.csv"


def parse_demand_modifier(raw, default=1.0):
    """Parse one ``demand_modifier`` cell of the events CSV.

    Args:
        raw: Cell content (string or None).
        default: Value used when the cell is blank, non-numeric or non-finite.

    Returns:
        The parsed float, or ``default`` (with a warning for non-numeric text)
        so that a single bad cell cannot abort the whole month build.
    """
    text = "" if raw is None else str(raw).strip()
    if not text:
        return float(default)
    try:
        value = float(text)
    except ValueError:
        print(f"[warn] Non-numeric demand_modifier in events CSV: '{text}' (using {default})")
        return float(default)
    # NaN/inf would propagate into every product this factor is multiplied into.
    return value if np.isfinite(value) else float(default)


event_rows = []
# utf-8-sig reads plain UTF-8 unchanged and additionally strips a leading BOM,
# which would otherwise become part of the first header name and make the
# "date" column lookup miss for every row.
with open(events_csv, "r", encoding="utf-8-sig", newline="") as f:
    rdr = csv.DictReader(f)
    # fieldnames is None for a completely empty file; treat that as "no columns".
    cols = {c.lower().strip(): c for c in (rdr.fieldnames or [])}
    for r in rdr:
        event_rows.append({
            "date": norm_date(r.get(cols.get("date","date"), "")),
            "event_type": (r.get(cols.get("event_type","event_type")) or r.get(cols.get("type","type")) or "Other").strip(),
            "stations_impacted": (r.get(cols.get("stations_impacted","stations_impacted")) or r.get(cols.get("stations","stations")) or "*").strip(),
            "demand_modifier": parse_demand_modifier(r.get(cols.get("demand_modifier","demand_modifier"))),
        })

# Event types that always apply network-wide, whatever stations the row lists.
GLOBAL_EVENT_TYPES = {"SaudiNationalDay"}
event_types_map = {}              # (date, SID) -> set(types)
event_mult_override = {}          # (date, SID) -> product(mods)
global_event_types_by_date = {}   # date -> set(types)
global_event_mult_by_date  = {}   # date -> product(mods)

for e in event_rows:
    d = e["date"]
    if not d: continue            # rows without a usable date are ignored
    etype = e["event_type"] or "Other"
    # A modifier of 0 is treated like a missing one and becomes the neutral 1.0.
    dm    = float(e.get("demand_modifier", 1.0) or 1.0)
    # Station tokens are ';'-separated.
    tokens = [s.strip() for s in (e["stations_impacted"] or "*").split(";")]

    is_global = (etype in GLOBAL_EVENT_TYPES) or any(_norm(t) in {"*","ALL","ALL STATIONS"} for t in tokens)
    if is_global:
        global_event_types_by_date.setdefault(d, set()).add(etype)
        global_event_mult_by_date[d] = global_event_mult_by_date.get(d, 1.0) * dm

    for tok in tokens:
        # Wildcards were handled by the global branch above.
        if tok == "" or _norm(tok) in {"*","ALL","ALL STATIONS"}: continue
        sid = resolve_sid(tok)
        if sid is None:
            print(f"[warn] Unknown station alias in events CSV: '{tok}'")
            continue
        key = (d, _norm(sid))
        event_types_map.setdefault(key, set()).add(etype)
        event_mult_override[key] = event_mult_override.get(key, 1.0) * dm

# Holidays (only read when HOLIDAYS_ON; the file itself is optional)
holiday_dates = set()
if HOLIDAYS_ON:
    holidays_csv = f"{SEED}/holidays.csv"
    if os.path.exists(holidays_csv):
        with open(holidays_csv, "r", encoding="utf-8-sig", newline="") as f:
            rdr = csv.DictReader(f)
            cols = {c.lower().strip(): c for c in (rdr.fieldnames or [])}
            for r in rdr:
                d = norm_date(r.get(cols.get("date","date"), ""))
                if d: holiday_dates.add(d)

def list_event_types(date_str, sid):
    """Return the sorted event types that apply to ``sid`` on ``date_str``.

    Station-specific types and network-wide (global) types for the date are
    merged; an empty list means no event applies.
    """
    station_types = event_types_map.get((date_str, _norm(sid)), set())
    global_types = global_event_types_by_date.get(date_str, set())
    return sorted(set(station_types) | set(global_types))

def event_csv_multiplier(date_str, sid):
    """Return the combined events-CSV demand factor for ``sid`` on ``date_str``.

    The station-specific product and the network-wide product for the date are
    multiplied together; each defaults to the neutral 1.0 when absent.
    """
    station_factor = event_mult_override.get((date_str, _norm(sid)), 1.0)
    global_factor = global_event_mult_by_date.get(date_str, 1.0)
    return float(1.0 * station_factor * global_factor)

# -------------------- 4) Multipliers --------------------
# Multiplier tables come from the "multipliers" section of the YAML config; every
# lookup falls back to a neutral value so a missing section leaves demand unchanged.
mult_cfg     = (config.get("multipliers", {}) or {})
weather_mult = mult_cfg.get("weather", {}) or {}
events_mult  = mult_cfg.get("events", {}) or {}
weekend_mult = float(mult_cfg.get("weekend", 1.0))
# With HOLIDAYS_ON disabled the holiday factor is pinned to 1.0 regardless of config.
holiday_mult = float(mult_cfg.get("holiday", 1.0)) if HOLIDAYS_ON else 1.0
# "stack": holiday and event factors are multiplied; any other value makes
# build_modifier use the larger of the two instead.
COMBINE_MODE = "stack"  # multiply components

def build_modifier(row):
    """Compute the demand multiplier for one grid row.

    Components: weekend factor, holiday factor, event factor and weather
    factor. The event factor comes from the events CSV; when that is exactly
    1.0 the per-type factors from the config are used instead (unknown types
    fall back to the "Other" entry, then to 1.0). Holiday and event factors
    are multiplied in "stack" mode, otherwise the larger of the two is used.

    Args:
        row: Mapping-like row exposing "date", "station_id" and optionally
            "is_weekend" / "weather_code" through ``.get``.

    Returns:
        The combined multiplier as a float.
    """
    weekend_factor = weekend_mult if int(row.get("is_weekend", 0)) == 1 else 1.0
    holiday_factor = holiday_mult if row["date"] in holiday_dates else 1.0

    event_factor = event_csv_multiplier(row["date"], row["station_id"])
    if event_factor == 1.0:
        # No effective CSV modifier: derive the factor from the configured per-type table.
        event_factor = 1.0
        for event_type in list_event_types(row["date"], row["station_id"]):
            event_factor *= float(events_mult.get(event_type, events_mult.get("Other", 1.0)))

    if COMBINE_MODE == "stack":
        combined = weekend_factor * holiday_factor * event_factor
    else:
        combined = weekend_factor * max(holiday_factor, event_factor)

    # Rows without a weather_code map to the empty key, i.e. normally a neutral 1.0.
    weather_key = str(row.get("weather_code", "") or "")
    combined *= float(weather_mult.get(weather_key, 1.0))
    return float(combined)

# -------------------- 5) Build the full month grid --------------------
# Replicate the base-day template once per calendar day and stamp every copy
# with that day's date fields.
dates = pd.date_range(MONTH_START, MONTH_END, freq="D")
frames = []
for d in dates:
    df_d = base_day.copy()
    h = (df_d["minute_of_day"] // 60).astype(int)
    m = (df_d["minute_of_day"] %  60).astype(int)
    date_iso = d.strftime("%Y-%m-%d")
    df_d["date"]        = date_iso
    # Assemble "YYYY-MM-DD HH:MM:00" strings and parse them into timestamps.
    df_d["timestamp"]   = pd.to_datetime(f"{date_iso} " + h.astype(str).str.zfill(2) + ":" + m.astype(str).str.zfill(2) + ":00")
    df_d["hour"]        = h
    df_d["day_of_week"] = d.weekday()
    df_d["is_weekend"]  = df_d["day_of_week"].isin([4,5]).astype(int)  # Fri=4, Sat=5
    frames.append(df_d)

# One long frame for the whole month in a stable (date, station, minute) order.
df = pd.concat(frames, ignore_index=True).sort_values(["date","station_id","minute_of_day"]).reset_index(drop=True)

# -------------------- 6) Apply modifiers & map to capacity --------------------
# Row-wise context multiplier, then demand_final = base_demand * modifier
# (unparseable demand counts as 0, an unparseable modifier as 1.0).
df["modifier"] = df.apply(build_modifier, axis=1)
base_demand_safe = pd.to_numeric(df.get("base_demand", 0), errors="coerce").fillna(0)
df["demand_final"] = (base_demand_safe * pd.to_numeric(df["modifier"], errors="coerce").fillna(1.0)).fillna(0)

# capacities (left merge: stations absent from capacity_df end up with NaN capacity)
df = df.merge(capacity_df, on="station_id", how="left")

# normalize by GLOBAL monthly max (for realistic network-wide scaling);
# the 1e-9 floor avoids a division by zero when all demand is 0.
global_max = max(df["demand_final"].max(), 1e-9)
df["_denom"] = global_max
df["demand_norm_final"] = (df["demand_final"] / df["_denom"]).clip(0, 1)

def station_total_from_norm(row):
    """Convert a row's normalised demand into a head-count.

    The result is ``demand_norm_final * capacity_station * boost`` rounded to
    an int, where ``boost`` is the events-CSV multiplier clamped to
    [1.0, 1.10] (events may add at most +10%).

    Args:
        row: Mapping-like row with "capacity_station", "demand_norm_final",
            "date" and "station_id".

    Returns:
        0 when the capacity is missing, NaN or non-positive, or when the
        normalised demand is NaN; otherwise the rounded head-count.
    """
    cap = float(row.get("capacity_station") or 0)
    # NaN capacity appears for stations missing from stations.json after the
    # left merge; it is truthy and not <= 0, so it needs an explicit check,
    # otherwise int(np.round(nan)) below raises ValueError.
    if np.isnan(cap) or cap <= 0: return 0
    norm = float(row["demand_norm_final"])
    if np.isnan(norm): return 0
    evb  = event_csv_multiplier(row["date"], row["station_id"])
    boost = min(1.10, 1.0 if evb <= 1.0 else min(evb, 1.10))  # cap +10% on event boost
    return int(np.round(norm * cap * boost))

df["station_total"] = df.apply(station_total_from_norm, axis=1).astype(int)

def crowd_from_cap(row):
    """Classify a row's crowding from ``station_total / capacity_station``.

    Args:
        row: Mapping-like row with "capacity_station" and "station_total".

    Returns:
        "Low" (< 30%), "Medium" (< 60%), "High" (< 85%) or "Extreme".
        Unknown capacity (missing, NaN or non-positive) yields "Medium".
    """
    cap = float(row.get("capacity_station") or 0)
    x = float(row.get("station_total") or 0)
    # NaN fails every comparison below, which used to fall through to
    # "Extreme"; treat NaN capacity like any other unknown capacity.
    if np.isnan(cap) or cap <= 0: return "Medium"
    # A NaN head-count is handled like a missing one (0), matching the `or 0` fallback.
    if np.isnan(x): x = 0.0
    r = x / cap
    if   r < 0.30: return "Low"
    elif r < 0.60: return "Medium"
    elif r < 0.85: return "High"
    else:          return "Extreme"
# Row-wise crowd label derived from station_total vs. capacity.
df["crowd_level"] = df.apply(crowd_from_cap, axis=1)

# flags / types (read from calendar CSV): event types are joined with "+",
# and the literal "None" marks rows without any event.
df["special_event_type"] = df.apply(lambda r: "+".join(list_event_types(r["date"], r["station_id"])) or "None", axis=1)
df["event_flag"]   = (df["special_event_type"] != "None").astype(int)
df["holiday_flag"] = df["date"].isin(holiday_dates).astype(int) if HOLIDAYS_ON else 0

# headways: the median of the configured peak / off-peak patterns (minutes) is used;
# the list literals are the defaults when the config has no such entry.
headway_cfg = config.get("headway", {})
peaks_cfg   = config.get("peaks", [])
peak_hours  = [int(x.get("hour")) for x in peaks_cfg if "hour" in x]
peak_hw_min    = float(np.median(headway_cfg.get("peak_pattern",    [7,7,6,8])))
offpeak_hw_min = float(np.median(headway_cfg.get("offpeak_pattern", [11,10,12,11])))
# Headway in seconds for a given hour: peak value inside peak_hours, off-peak otherwise.
def hw_for_hour(h): return int(peak_hw_min*60) if int(h) in peak_hours else int(offpeak_hw_min*60)
# Keep headway values already present in the frame; only missing/non-numeric ones
# are filled from the hour-based rule.
df["headway_seconds"] = df.get("headway_seconds")
df["headway_seconds"] = pd.to_numeric(df["headway_seconds"], errors="coerce")
mask = df["headway_seconds"].isna()
df.loc[mask, "headway_seconds"] = df.loc[mask, "hour"].apply(hw_for_hour)
df["headway_seconds"] = df["headway_seconds"].astype(int)

# -------------------- 7) Output (monthly + optional daily) --------------------
# Column order of the exported CSVs.
FINAL_SCHEMA = [
    "date","timestamp","hour","minute_of_day","day_of_week","is_weekend",
    "station_id",
    "base_demand","modifier","demand_final",
    "station_total","crowd_level",
    "special_event_type","event_flag","holiday_flag",
    "headway_seconds"
]
# Any schema column that was never computed is added as NaN so the selection below cannot fail.
for c in FINAL_SCHEMA:
    if c not in df.columns:
        df[c] = np.nan
out = df[FINAL_SCHEMA].sort_values(["date","station_id","minute_of_day"]).reset_index(drop=True)

# QA: every row has a station id and no head-count is negative.
assert out["station_id"].notna().all()
assert (out["station_total"] >= 0).all()

# Save monthly file (utf-8-sig writes a BOM at the start of the file)
OUT_MONTH = f"{OUT_DIR}/cf_month_{MONTH}.csv"
out.to_csv(OUT_MONTH, index=False, encoding="utf-8-sig")
print("Saved ✓", OUT_MONTH, "| Rows:", len(out), "| Dates:", out['date'].min(), "→", out['date'].max())

# (Optional) per-day files, one CSV per date value
if SAVE_DAILY:
    for d, g in out.groupby("date", sort=True):
        p = f"{OUT_DIR}/cf_day_{d}.csv"
        g.to_csv(p, index=False, encoding="utf-8-sig")
    print("Daily CSVs saved in:", OUT_DIR)


Loaded base-day from: /content/2025_GP_28/masar-sim/data/generated/base_day.csv
Saved ✓ /content/2025_GP_28/masar-sim/data/generated/cf_month_2025-09.csv | Rows: 194580 | Dates: 2025-09-01 → 2025-09-30


**Note:** Run the code cell below, not the one above. — هيا اللي تحت الكود اللي تشغلينه مو اللي فوق

In [5]:
# ============================================================
# Masar Occupancy — Month Generator (September 2025)
# - Builds minute-level demand for 2025-09-01 → 2025-09-30
# - Reads event types from calendar_events.csv per date/station
# - Converts normalized base-day demand into pax/min using
#   service capacity (headway, train capacity, directions)
# - Computes minute-level station occupancy (concourse+platform)
# - Saves consolidated monthly CSV (+ optional per-day files)
# ============================================================

import os, csv, json, yaml
import numpy as np
import pandas as pd
from datetime import datetime, timedelta

# -------------------- 0) Paths & config --------------------
# Fixed Colab checkout location; seed, config and output paths are all derived from it.
ROOT = "/content/2025_GP_28/masar-sim"
SEED = f"{ROOT}/data/seeds"
CONF = f"{ROOT}/sims/00_config.yaml"
OUT_DIR = f"{ROOT}/data/generated"
# Make sure the output folder exists before anything is written to it.
os.makedirs(OUT_DIR, exist_ok=True)

# The config file is optional here: without it (or when it loads as a falsy value)
# config stays an empty dict and every later lookup uses its built-in default.
config = {}
if os.path.exists(CONF):
    with open(CONF, "r", encoding="utf-8") as f:
        config = yaml.safe_load(f) or {}

# Month window (inclusive)
MONTH = "2025-09"


def month_bounds(month: str):
    """Return the first and last calendar day of ``month``.

    Args:
        month: Month in ``YYYY-MM`` form, e.g. ``"2025-09"``.

    Returns:
        Tuple ``(start, end)`` of ``pd.Timestamp``; ``start`` is day 1 and
        ``end`` is the month's real last day (28-31, leap years included).
    """
    start = pd.Timestamp(f"{month}-01")
    # MonthEnd(0) rolls a date forward to the end of its own month.
    return start, start + pd.offsets.MonthEnd(0)


# Both bounds are derived from MONTH, so changing MONTH can never leave a stale
# (or invalid, e.g. "2025-02-30") hard-coded last day behind.
MONTH_START, MONTH_END = month_bounds(MONTH)

HOLIDAYS_ON = False        # set True to enable holiday multipliers
SAVE_DAILY  = False        # set True if you also want per-day CSVs

# -------------------- 1) Load base-day template --------------------
# We only need a single-day base grid: station_id, minute_of_day, base_demand (0..1)
# Candidate locations of the single-day template, probed in order; the first existing file wins.
candidates = [
    f"{OUT_DIR}/day_base.csv",             # recommended new name
    f"{OUT_DIR}/base_day.csv",             # your current file
    f"{ROOT}/data/base/day_base.csv",
    f"{ROOT}/data/base/base_day.csv",
    f"{OUT_DIR}/day_demand_base.csv",      # legacy
    f"{ROOT}/data/base/day_demand_base.csv",
]
src = next((p for p in candidates if os.path.exists(p)), None)
if src is None:
    raise FileNotFoundError("No base-day CSV found. Expected one of:\n" + "\n".join(candidates))

base_day = pd.read_csv(src)
print("Loaded base-day from:", src)

# Normalize headers: trim + lower-case, then map known header variants onto the
# three canonical names station_id / base_demand / minute_of_day.
base_day.columns = [str(c).strip().lower() for c in base_day.columns]
rename_map = {
    "station":"station_id","station code":"station_id","station_code":"station_id","sid":"station_id",
    "base":"base_demand","basedemand":"base_demand","demand_base":"base_demand",
    "base_day_demand":"base_demand","base_day":"base_demand",
    "minute":"minute_of_day","min":"minute_of_day","minuteofday":"minute_of_day","minute-of-day":"minute_of_day",
}
base_day = base_day.rename(columns=rename_map)

# Build minute_of_day if missing. Sources are tried in order: hour+minute columns,
# a "time" column, a "timestamp" column; unparseable values become minute 0.
if "minute_of_day" not in base_day.columns:
    if {"hour","minute"}.issubset(base_day.columns):
        base_day["minute_of_day"] = (
            pd.to_numeric(base_day["hour"], errors="coerce").fillna(0).astype(int)*60 +
            pd.to_numeric(base_day["minute"], errors="coerce").fillna(0).astype(int)
        )
    elif "time" in base_day.columns:
        t = pd.to_datetime(base_day["time"], errors="coerce")
        base_day["minute_of_day"] = (t.dt.hour*60 + t.dt.minute).astype("Int64").fillna(0).astype(int)
    elif "timestamp" in base_day.columns:
        ts = pd.to_datetime(base_day["timestamp"], errors="coerce")
        base_day["minute_of_day"] = (ts.dt.hour*60 + ts.dt.minute).astype("Int64").fillna(0).astype(int)
    else:
        # Last resort: use the row position as the minute, clipped to 0..1439.
        base_day = base_day.reset_index().rename(columns={"index":"minute_of_day"})
        base_day["minute_of_day"] = base_day["minute_of_day"].clip(0, 1439).astype(int)

# Fail early if any of the three canonical columns is still absent.
for c in ["station_id","base_demand","minute_of_day"]:
    if c not in base_day.columns:
        raise KeyError(f"Missing required column '{c}' in base-day template.")

# Final dtype clean-up: trimmed string ids, numeric demand (bad values -> 0.0), integer minutes.
base_day["station_id"]     = base_day["station_id"].astype(str).str.strip()
base_day["base_demand"]    = pd.to_numeric(base_day["base_demand"], errors="coerce").fillna(0.0)  # 0..1 normalized
base_day["minute_of_day"]  = pd.to_numeric(base_day["minute_of_day"], errors="coerce").fillna(0).astype(int)

# -------------------- 2) Stations & capacities --------------------
def _norm(x):
    """Return ``x`` as a trimmed, upper-cased string (case-insensitive lookup key)."""
    return str(x).strip().upper()

with open(f"{SEED}/stations.json", "r", encoding="utf-8") as f:
    stations_list = json.load(f)

# Nested objects are flattened into dotted column names (e.g. "train_capacity.train_total").
stations_df = pd.json_normalize(stations_list)
# Expected fields: station_id, capacity_station, capacity_platform, served_lines (list),
# train_capacity.train_total (or train_capacity with keys)

# Per-train capacity used whenever stations.json does not provide one.
DEFAULT_TRAIN_TOTAL_CAP = 556.0

# Fallbacks
if "capacity_station" not in stations_df.columns:
    stations_df["capacity_station"] = 2000
if "capacity_platform" not in stations_df.columns:
    stations_df["capacity_platform"] = 1500

# Served lines count. Both branches yield a Series aligned with stations_df so
# that the .astype/.clip calls below work even when the column is absent
# (a bare scalar fallback has no such methods and raised AttributeError).
if "served_lines" in stations_df.columns:
    served_counts = stations_df["served_lines"].apply(lambda v: len(v) if isinstance(v, (list,tuple)) else 1)
else:
    served_counts = pd.Series(1, index=stations_df.index)

# Train total capacity — again always a Series, for the same reason.
if "train_capacity.train_total" in stations_df.columns:
    train_total_cap = pd.to_numeric(stations_df["train_capacity.train_total"], errors="coerce").fillna(DEFAULT_TRAIN_TOTAL_CAP)
elif "train_capacity" in stations_df.columns:
    # try to read nested dict; anything that is not a usable mapping gets the default
    def _cap(x):
        try:
            return float(x.get("train_total", DEFAULT_TRAIN_TOTAL_CAP))
        except Exception:
            return DEFAULT_TRAIN_TOTAL_CAP
    train_total_cap = stations_df["train_capacity"].apply(_cap)
else:
    train_total_cap = pd.Series(DEFAULT_TRAIN_TOTAL_CAP, index=stations_df.index)

stations_df["_served_lines_count"] = served_counts.astype(int).clip(lower=1)
stations_df["_train_total_cap"]    = pd.to_numeric(train_total_cap, errors="coerce").fillna(DEFAULT_TRAIN_TOTAL_CAP)

# Slim per-station table that is merged onto the month grid.
capacity_df = stations_df[[
    "station_id","capacity_station","capacity_platform","_served_lines_count","_train_total_cap"
]].copy()

# Quick SID maps
# Reverse lookups keyed by normalised code / normalised name, both yielding the station_id.
sid_by_code, sid_by_name = {}, {}
for st in stations_list:
    sid  = str(st.get("station_id","")).strip()
    code = str(st.get("code","")).strip()
    name = str(st.get("name","")).strip()
    if code: sid_by_code[_norm(code)] = sid
    if name: sid_by_name[_norm(name)] = sid

# Extra labels (already in normalised, upper-case form) mapped to a station code;
# consulted by resolve_sid only after the code and name lookups have missed.
ALIASES = {
    "AIRPORT T1-2": "AIRP_T12",
    "QASR AL-HOKM": "QASR",
    "NATIONAL MUSEUM": "MUSEUM",
    "WESTERN STATION": "S6",
}
def resolve_sid(token: str):
    """Translate a station token into a station_id.

    The normalised token is looked up as a station code first, then as a
    station name, and finally in ALIASES. An alias target is itself treated
    as a code; if that code is unknown the raw alias target is returned.

    Returns:
        The matching station_id, or None when the token is not recognised.
    """
    key = _norm(token)
    for table in (sid_by_code, sid_by_name):
        if key in table:
            return table[key]
    if key not in ALIASES:
        return None
    alias_target = ALIASES[key]
    return sid_by_code.get(_norm(alias_target), alias_target)

# -------------------- 3) Calendar (events & holidays) --------------------
def norm_date(x: str) -> str:
    """Normalise a date-like value to ``YYYY-MM-DD``.

    Parsing is attempted month-first and, only if that yields nothing,
    day-first. None, blank and unparseable inputs all give "".
    """
    text = "" if x is None else str(x).strip()
    if text == "":
        return ""
    parsed = pd.NaT
    for day_first in (False, True):
        parsed = pd.to_datetime(text, errors="coerce", dayfirst=day_first)
        if not pd.isna(parsed):
            break
    if pd.isna(parsed):
        return ""
    return parsed.strftime("%Y-%m-%d")

# Events
events_csv = f"{SEED}/calendar_events.csv"


def parse_demand_modifier(raw, default=1.0):
    """Parse one ``demand_modifier`` cell of the events CSV.

    Args:
        raw: Cell content (string or None).
        default: Value used when the cell is blank, non-numeric or non-finite.

    Returns:
        The parsed float, or ``default`` (with a warning for non-numeric text)
        so that a single bad cell cannot abort the whole month build.
    """
    text = "" if raw is None else str(raw).strip()
    if not text:
        return float(default)
    try:
        value = float(text)
    except ValueError:
        print(f"[warn] Non-numeric demand_modifier in events CSV: '{text}' (using {default})")
        return float(default)
    # NaN/inf would propagate into every product this factor is multiplied into.
    return value if np.isfinite(value) else float(default)


event_rows = []
# utf-8-sig reads plain UTF-8 unchanged and additionally strips a leading BOM,
# which would otherwise become part of the first header name and make the
# "date" column lookup miss for every row.
with open(events_csv, "r", encoding="utf-8-sig", newline="") as f:
    rdr = csv.DictReader(f)
    # fieldnames is None for a completely empty file; treat that as "no columns".
    cols = {c.lower().strip(): c for c in (rdr.fieldnames or [])}
    for r in rdr:
        event_rows.append({
            "date": norm_date(r.get(cols.get("date","date"), "")),
            "event_type": (r.get(cols.get("event_type","event_type")) or r.get(cols.get("type","type")) or "Other").strip(),
            "stations_impacted": (r.get(cols.get("stations_impacted","stations_impacted")) or r.get(cols.get("stations","stations")) or "*").strip(),
            "demand_modifier": parse_demand_modifier(r.get(cols.get("demand_modifier","demand_modifier"))),
        })

# Event types that always apply network-wide, whatever stations the row lists.
GLOBAL_EVENT_TYPES = {"SaudiNationalDay"}
event_types_map = {}              # (date, SID) -> set(types)
event_mult_override = {}          # (date, SID) -> product(mods)
global_event_types_by_date = {}   # date -> set(types)
global_event_mult_by_date  = {}   # date -> product(mods)

for e in event_rows:
    d = e["date"]
    if not d: continue            # rows without a usable date are ignored
    etype = e["event_type"] or "Other"
    # A modifier of 0 is treated like a missing one and becomes the neutral 1.0.
    dm    = float(e.get("demand_modifier", 1.0) or 1.0)
    # Station tokens are ';'-separated.
    tokens = [s.strip() for s in (e["stations_impacted"] or "*").split(";")]

    is_global = (etype in GLOBAL_EVENT_TYPES) or any(_norm(t) in {"*","ALL","ALL STATIONS"} for t in tokens)
    if is_global:
        global_event_types_by_date.setdefault(d, set()).add(etype)
        global_event_mult_by_date[d] = global_event_mult_by_date.get(d, 1.0) * dm

    for tok in tokens:
        # Wildcards were handled by the global branch above.
        if tok == "" or _norm(tok) in {"*","ALL","ALL STATIONS"}: continue
        sid = resolve_sid(tok)
        if sid is None:
            print(f"[warn] Unknown station alias in events CSV: '{tok}'")
            continue
        key = (d, _norm(sid))
        event_types_map.setdefault(key, set()).add(etype)
        event_mult_override[key] = event_mult_override.get(key, 1.0) * dm

# Holidays (only read when HOLIDAYS_ON; the file itself is optional)
holiday_dates = set()
if HOLIDAYS_ON:
    holidays_csv = f"{SEED}/holidays.csv"
    if os.path.exists(holidays_csv):
        with open(holidays_csv, "r", encoding="utf-8-sig", newline="") as f:
            rdr = csv.DictReader(f)
            cols = {c.lower().strip(): c for c in (rdr.fieldnames or [])}
            for r in rdr:
                d = norm_date(r.get(cols.get("date","date"), ""))
                if d: holiday_dates.add(d)

def list_event_types(date_str, sid):
    """Return the sorted event types that apply to ``sid`` on ``date_str``.

    Station-specific types and network-wide (global) types for the date are
    merged; an empty list means no event applies.
    """
    station_types = event_types_map.get((date_str, _norm(sid)), set())
    global_types = global_event_types_by_date.get(date_str, set())
    return sorted(set(station_types) | set(global_types))

def event_csv_multiplier(date_str, sid):
    """Return the combined events-CSV demand factor for ``sid`` on ``date_str``.

    The station-specific product and the network-wide product for the date are
    multiplied together; each defaults to the neutral 1.0 when absent.
    """
    station_factor = event_mult_override.get((date_str, _norm(sid)), 1.0)
    global_factor = global_event_mult_by_date.get(date_str, 1.0)
    return float(1.0 * station_factor * global_factor)

# -------------------- 4) Multipliers --------------------
# Multiplier tables come from the "multipliers" section of the YAML config; every
# lookup falls back to a neutral value so a missing section leaves demand unchanged.
mult_cfg     = (config.get("multipliers", {}) or {})
weather_mult = mult_cfg.get("weather", {}) or {}
events_mult  = mult_cfg.get("events", {}) or {}
weekend_mult = float(mult_cfg.get("weekend", 1.0))
# With HOLIDAYS_ON disabled the holiday factor is pinned to 1.0 regardless of config.
holiday_mult = float(mult_cfg.get("holiday", 1.0)) if HOLIDAYS_ON else 1.0
# "stack": holiday and event factors are multiplied; any other value makes
# build_modifier use the larger of the two instead.
COMBINE_MODE = "stack"  # multiply components

def build_modifier(row):
    """Compute the demand multiplier for one grid row.

    Components: weekend factor, holiday factor, event factor and weather
    factor. The event factor comes from the events CSV; when that is exactly
    1.0 the per-type factors from the config are used instead (unknown types
    fall back to the "Other" entry, then to 1.0). Holiday and event factors
    are multiplied in "stack" mode, otherwise the larger of the two is used.

    Args:
        row: Mapping-like row exposing "date", "station_id" and optionally
            "is_weekend" / "weather_code" through ``.get``.

    Returns:
        The combined multiplier as a float.
    """
    weekend_factor = weekend_mult if int(row.get("is_weekend", 0)) == 1 else 1.0
    holiday_factor = holiday_mult if row["date"] in holiday_dates else 1.0

    event_factor = event_csv_multiplier(row["date"], row["station_id"])
    if event_factor == 1.0:
        # No effective CSV modifier: derive the factor from the configured per-type table.
        event_factor = 1.0
        for event_type in list_event_types(row["date"], row["station_id"]):
            event_factor *= float(events_mult.get(event_type, events_mult.get("Other", 1.0)))

    if COMBINE_MODE == "stack":
        combined = weekend_factor * holiday_factor * event_factor
    else:
        combined = weekend_factor * max(holiday_factor, event_factor)

    # Rows without a weather_code map to the empty key, i.e. normally a neutral 1.0.
    weather_key = str(row.get("weather_code", "") or "")
    combined *= float(weather_mult.get(weather_key, 1.0))
    return float(combined)

# -------------------- 5) Build the full month grid --------------------
# Replicate the base-day template once per calendar day and stamp every copy
# with that day's date fields.
dates = pd.date_range(MONTH_START, MONTH_END, freq="D")
frames = []
for d in dates:
    df_d = base_day.copy()
    h = (df_d["minute_of_day"] // 60).astype(int)
    m = (df_d["minute_of_day"] %  60).astype(int)
    date_iso = d.strftime("%Y-%m-%d")
    df_d["date"]        = date_iso
    # Assemble "YYYY-MM-DD HH:MM:00" strings and parse them into timestamps.
    df_d["timestamp"]   = pd.to_datetime(f"{date_iso} " + h.astype(str).str.zfill(2) + ":" + m.astype(str).str.zfill(2) + ":00")
    df_d["hour"]        = h
    df_d["day_of_week"] = d.weekday()
    df_d["is_weekend"]  = df_d["day_of_week"].isin([4,5]).astype(int)  # Fri=4, Sat=5
    frames.append(df_d)

# One long frame for the whole month in a stable (date, station, minute) order.
df = pd.concat(frames, ignore_index=True).sort_values(["date","station_id","minute_of_day"]).reset_index(drop=True)

# -------------------- 6) Apply modifiers (still normalized 0..1) --------------------
# Row-wise context multiplier, then demand_final = base_demand * modifier
# (unparseable demand counts as 0.0, an unparseable modifier as 1.0).
df["modifier"] = df.apply(build_modifier, axis=1)
base_demand_norm = pd.to_numeric(df.get("base_demand", 0), errors="coerce").fillna(0.0)  # 0..1
df["demand_final"] = (base_demand_norm * pd.to_numeric(df["modifier"], errors="coerce").fillna(1.0)).fillna(0.0)

# -------------------- 7) Merge capacities, lines, train cap --------------------
# Left merge: stations absent from capacity_df end up with NaN in the capacity columns.
df = df.merge(capacity_df, on="station_id", how="left")

# Effective directions (two per line); a missing line count is treated as one line.
dirs = (2 * pd.to_numeric(df["_served_lines_count"], errors="coerce").fillna(1).clip(lower=1)).astype(float)
df["_effective_dirs"] = dirs

# Train total capacity per train (556.0 when missing after the merge)
df["_train_total_cap"] = pd.to_numeric(df["_train_total_cap"], errors="coerce").fillna(556.0)

# -------------------- 8) Headway (sec) from config --------------------
# Headway settings: peak hours default to 7, 8, 17 and 18 when the config lists none.
headway_cfg = config.get("headway", {})
peaks_cfg   = config.get("peaks", [])
peak_hours  = [int(x.get("hour")) for x in peaks_cfg if "hour" in x] or [7,8,17,18]

# The median of each configured pattern (minutes) is used; the list literals are the defaults.
peak_hw_min    = float(np.median(headway_cfg.get("peak_pattern",    [7,7,6,8])))
offpeak_hw_min = float(np.median(headway_cfg.get("offpeak_pattern", [11,10,12,11])))

def hw_for_hour(h):
    """Return the headway in seconds for hour ``h``: peak value inside ``peak_hours``, off-peak otherwise."""
    return int(peak_hw_min*60) if int(h) in peak_hours else int(offpeak_hw_min*60)

# Keep headway values already present in the frame; only missing/non-numeric ones
# are filled from the hour-based rule.
df["headway_seconds"] = pd.to_numeric(df.get("headway_seconds"), errors="coerce")
mask = df["headway_seconds"].isna()
df.loc[mask, "headway_seconds"] = df.loc[mask, "hour"].apply(hw_for_hour)
df["headway_seconds"] = df["headway_seconds"].astype(int)

# -------------------- 9) Convert normalized demand → pax/min --------------------
# Assumptions (override via config.assumptions.* if present)
# Tunable assumptions, read from config["assumptions"] with built-in defaults.
assumptions = config.get("assumptions", {}) or {}
TARGET_UTIL = float(assumptions.get("target_util_peak", 0.65))     # target utilization at peak minute
LOAD        = float(assumptions.get("boarding_load_factor", 0.60)) # proportion boarding per train

# Peak headway in minutes (used for scaling)
PEAK_HW_MIN = peak_hw_min

# Service capacity at peak (pax/min): μ_peak = (directions / peak_hw) * train_capacity * LOAD
mu_peak = (df["_effective_dirs"] / PEAK_HW_MIN) * df["_train_total_cap"] * LOAD

# Peak normalized demand per station after modifiers (proxy for "peak minute" in base curve).
# The maximum is taken over the whole month per station; a zero maximum becomes NaN
# so the division below does not divide by zero.
max_base = df.groupby("station_id")["demand_final"].transform("max").replace(0, np.nan)

# Station-specific scale factor so that peak minute ≈ TARGET_UTIL * μ_peak
# (stations whose demand is zero all month get scale 0.0).
scale = (TARGET_UTIL * mu_peak) / max_base
scale = scale.fillna(0.0)

# Final arrivals r (pax/min)
df["station_flow_per_min"] = df["demand_final"] * scale

# -------------------- 10) Occupancy model (concourse + platform) --------------------
# Dwell assumptions (override from config.assumptions.* if available)
# Dwell times in minutes, read from config["assumptions"] with built-in defaults.
CONCOURSE_DWELL_MIN = float(assumptions.get("concourse_dwell_min", 6.0))
PLATFORM_DWELL_MIN  = float(assumptions.get("platform_dwell_min", 3.0))

# Base occupancy (Little's Law approx.): Occ ≈ λ * (Tc + Tp)
# i.e. arrivals per minute times minutes spent in each area.
df["_occ_concourse"] = df["station_flow_per_min"] * CONCOURSE_DWELL_MIN
df["_occ_platform"]  = df["station_flow_per_min"] * PLATFORM_DWELL_MIN
df["_occ_base"]      = df["_occ_concourse"] + df["_occ_platform"]

# Optional: add a small queueing bump when headway is worse than peak.
# hw_bump is the relative excess of the current headway over the peak headway (0 when not worse).
current_hw_min = df["headway_seconds"] / 60.0
hw_bump = np.maximum(current_hw_min - PEAK_HW_MIN, 0) / np.maximum(PEAK_HW_MIN, 1e-9)
df["_occ_queue_bump"] = df["_occ_base"] * 0.25 * hw_bump  # 25% sensitivity (tune if needed)

# Total occupancy before capping
df["_occ_total"] = df["_occ_base"] + df["_occ_queue_bump"]

# -------------------- 11) Map to final outputs --------------------
# Cap occupancy to station capacity (allow +10% on special events)
# Missing/non-numeric capacities count as 0, which caps station_total at 0 for those stations.
cap_station = pd.to_numeric(df["capacity_station"], errors="coerce").fillna(0.0)
# Rows whose events-CSV multiplier exceeds 1.0 may go up to 110% of capacity.
event_boost = np.where(df.apply(lambda r: event_csv_multiplier(r["date"], r["station_id"])>1.0, axis=1), 1.10, 1.00)

# Occupancy limited to the (possibly boosted) capacity, rounded to whole passengers, never negative.
df["station_total"] = np.round(
    np.minimum(df["_occ_total"], cap_station * event_boost)
).astype(int).clip(lower=0)

def crowd_from_cap(row):
    """Classify a row's crowding from ``station_total / capacity_station``.

    Args:
        row: Mapping-like row with "capacity_station" and "station_total".

    Returns:
        "Low" (< 30%), "Medium" (< 60%), "High" (< 85%) or "Extreme".
        Unknown capacity (missing, NaN or non-positive) yields "Medium".
    """
    cap = float(row.get("capacity_station") or 0)
    x = float(row.get("station_total") or 0)
    # NaN fails every comparison below, which used to fall through to
    # "Extreme"; treat NaN capacity like any other unknown capacity.
    if np.isnan(cap) or cap <= 0: return "Medium"
    # A NaN head-count is handled like a missing one (0), matching the `or 0` fallback.
    if np.isnan(x): x = 0.0
    r = x / cap
    if   r < 0.30: return "Low"
    elif r < 0.60: return "Medium"
    elif r < 0.85: return "High"
    else:          return "Extreme"

# Row-wise crowd label derived from station_total vs. capacity.
df["crowd_level"] = df.apply(crowd_from_cap, axis=1)

# flags / types (read from calendar CSV): event types are joined with "+",
# and the literal "None" marks rows without any event.
df["special_event_type"] = df.apply(lambda r: "+".join(list_event_types(r["date"], r["station_id"])) or "None", axis=1)
df["event_flag"]   = (df["special_event_type"] != "None").astype(int)
# holiday_flag is a constant 0 while HOLIDAYS_ON is disabled.
df["holiday_flag"] = df["date"].isin(holiday_dates).astype(int) if HOLIDAYS_ON else 0

# -------------------- 12) Output (monthly + optional daily) --------------------
# Column order of the exported CSVs.
FINAL_SCHEMA = [
    "date","timestamp","hour","minute_of_day","day_of_week","is_weekend",
    "station_id",
    "base_demand","modifier","demand_final",
    "station_flow_per_min",          # arrivals (pax/min) after scaling
    "station_total","crowd_level",
    "special_event_type","event_flag","holiday_flag",
    "headway_seconds"
]
# Any schema column that was never computed is added as NaN so the selection below cannot fail.
for c in FINAL_SCHEMA:
    if c not in df.columns:
        df[c] = np.nan
out = df[FINAL_SCHEMA].sort_values(["date","station_id","minute_of_day"]).reset_index(drop=True)

# QA: every row has a station id and no head-count is negative.
assert out["station_id"].notna().all()
assert (out["station_total"] >= 0).all()

# Save monthly file (utf-8-sig writes a BOM at the start of the file)
OUT_MONTH = f"{OUT_DIR}/cf_month_{MONTH}.csv"
out.to_csv(OUT_MONTH, index=False, encoding="utf-8-sig")
print("Saved ✓", OUT_MONTH, "| Rows:", len(out), "| Dates:", out['date'].min(), "→", out['date'].max())

# (Optional) per-day files, one CSV per date value
if SAVE_DAILY:
    for d, g in out.groupby("date", sort=True):
        p = f"{OUT_DIR}/cf_day_{d}.csv"
        g.to_csv(p, index=False, encoding="utf-8-sig")
    print("Daily CSVs saved in:", OUT_DIR)


Loaded base-day from: /content/2025_GP_28/masar-sim/data/generated/base_day.csv
Saved ✓ /content/2025_GP_28/masar-sim/data/generated/cf_month_2025-09.csv | Rows: 194580 | Dates: 2025-09-01 → 2025-09-30


In [6]:
# Colab-only helper: send the generated monthly CSV to the browser as a download.
# The file name is spelled out here rather than derived from MONTH, so it has to be
# kept in sync with the MONTH used by the generator cell.
from google.colab import files
files.download(f"{OUT_DIR}/cf_month_2025-09.csv")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>